## MLFLOW TRACKING

In [17]:

# starts an MLflow server locally.
!mlflow server --host 127.0.0.1 --port 8082

^C


In [18]:
from mlflow import MlflowClient
from pprint import pprint


In [19]:
# In order to connect to the tracking server, we’ll need to use the uri that we assigned the server when we started it.

client = MlflowClient(tracking_uri="http://127.0.0.1:8082")

#it allows programmatic interaction with the MLflow tracking server.

In [20]:
all_experiments = client.search_experiments()

print(all_experiments)

[<Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1725612823132, experiment_id='0', last_update_time=1725612823132, lifecycle_stage='active', name='Default', tags={}>]


In [21]:
# Provide an Experiment description that will appear in the UI
experiment_description = (
    "This is the loan default forecasting project. "
    "This experiment contains the produce models for bank loans."
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "ML OPS Loan Default",
    "store_dept": "predict",
    "team": "Rodner Kallel Rigoni",
    "project_quarter": "Q3-2024",
    "mlflow.note.content": experiment_description,
}

# Create the Experiment, providing a unique name
produce_loans_experiment = client.create_experiment(
    name="Loan_Default_Models", tags=experiment_tags
)


In [22]:
# Use search_experiments() to search on the project_name tag key

loans_experiment = client.search_experiments(
    filter_string="tags.`project_name` = 'ML OPS Loan Default'"
)

print(vars(loans_experiment[0]))

{'_experiment_id': '536545812455838994', '_name': 'Loan_Default_Models', '_artifact_location': 'mlflow-artifacts:/536545812455838994', '_lifecycle_stage': 'active', '_tags': {'mlflow.note.content': 'This is the loan default forecasting project. This experiment contains the produce models for bank loans.', 'project_name': 'ML OPS Loan Default', 'project_quarter': 'Q3-2024', 'store_dept': 'predict', 'team': 'Rodner Kallel Rigoni'}, '_creation_time': 1725612850593, '_last_update_time': 1725612850593}


In [23]:
# This function call sets the global tracking URI for the current session.
# It’s a convenient way to configure the tracking server URI without creating a separate client instance.
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:8082")

In [24]:
# Sets the current active experiment to the "Loan_Default_Models" experiment and
# returns the Experiment metadata
loan_experiment = mlflow.set_experiment("Loan_Default_Models")

# Define a run name for this iteration of training.
# If this is not set, a unique name will be auto-generated for your run.
run_name = "loans_rf_test"

# Define an artifact path that the model will be saved to.
artifact_path = "rf_loans"

## MODELS

In [1]:
# Importing the necessary libraries
import numpy as np
import pandas as pd

In [2]:
# Load the data from the 'Loan_Data.csv' file into a DataFrame named df
df = pd.read_csv('Cours_MLOPS/Loan_Data.csv')
df.head(2)

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1


#### DataFrame General Observation

In [3]:
df.shape

(10000, 8)

In [4]:
df.dtypes

customer_id                   int64
credit_lines_outstanding      int64
loan_amt_outstanding        float64
total_debt_outstanding      float64
income                      float64
years_employed                int64
fico_score                    int64
default                       int64
dtype: object

The DataFrame contains 10k observations and 8 variables, all of which are numerical.

#### Missing Values

In [5]:
# Count NaN values for each column
df.isna().sum()

customer_id                 0
credit_lines_outstanding    0
loan_amt_outstanding        0
total_debt_outstanding      0
income                      0
years_employed              0
fico_score                  0
default                     0
dtype: int64

The DataFrame does not contain any NaN values.

#### Studying the classes' balance

In [30]:
# Display class 1 percentage over the whole dataset
print("Class 1, Loan Default percentage :",round(df['default'].sum() / len(df)*100,2),"%")
print("Class 0, Loan Non-Default percentage :",100 - round(df['default'].sum() / len(df)*100,2),"%")

Class 1, Loan Default percentage : 18.51 %
Class 0, Loan Non-Default percentage : 81.49 %


#### DataFrame Statistics

In [7]:
# Avoid scientific digital format
pd.options.display.float_format = '{:.2f}'.format

# Display Statistics
df[['credit_lines_outstanding', 'loan_amt_outstanding',
       'total_debt_outstanding', 'income', 'years_employed', 'fico_score',
       'default']].describe()

Unnamed: 0,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,1.46,4159.68,8718.92,70039.9,4.55,637.56,0.19
std,1.74,1421.4,6627.16,20072.21,1.57,60.66,0.39
min,0.0,46.78,31.65,1000.0,0.0,408.0,0.0
25%,0.0,3154.24,4199.84,56539.87,3.0,597.0,0.0
50%,1.0,4052.38,6732.41,70085.83,5.0,638.0,0.0
75%,2.0,5052.9,11272.26,83429.17,6.0,679.0,0.0
max,5.0,10750.68,43688.78,148412.18,10.0,850.0,1.0


#### Standardizing income and debts

In [8]:
# Create standardization function
def standard_columns(column):
    return (column - column.mean()) / column.std() 

# Standardize the floats variables
df[df.select_dtypes(["float"]).columns] = df[
    df.select_dtypes(["float"]).columns 
].apply(standard_columns)
df.head(2)

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,0.75,-0.72,0.4,5,605,0
1,7442532,5,-1.55,-0.07,-2.16,2,572,1


In [9]:
# Import tools necessary for all models
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Set target and features columns
col_target = "default"
col_features = df.columns.tolist()[1:-1] # Get all the columns except the customer_id and the target columns

# Instanciate target and features into numpy array for all the models
X = np.array(df[col_features]).astype(float) # Convert the features to a numpy array
y = np.array(df[col_target]) # Convert the target to a numpy array

### First Model: Scikit-Learn Logistic Regression
With class weight balanced, and adding an intercept to the model

In [10]:
from sklearn.linear_model import LogisticRegression
from statsmodels.tools import add_constant

# Split the data into training and test sets with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=43)

# Add a constant (intercept) to the feature matrices
X_train = add_constant(X_train)
X_test = add_constant(X_test)

# Create a logistic regression model with sklearn and handle class imbalance
log_reg = LogisticRegression(class_weight='balanced')

# Fit the model to the training data
log_reg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = log_reg.predict(X_test)  # Predicted classes (0 or 1)

# Get the predicted probabilities for each observation
y_pred_prob = log_reg.predict_proba(X_test)[:, 1]  # Probabilities for class 1

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Confusion Matrix:
 [[2433   12]
 [   0  555]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2445
           1       0.98      1.00      0.99       555

    accuracy                           1.00      3000
   macro avg       0.99      1.00      0.99      3000
weighted avg       1.00      1.00      1.00      3000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Second Model: Statsmodels Logit

In [29]:
import statsmodels.api as sm

# Split the data into training and test set with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=43)

# Create a logistic regression model with statsmodels
model = sm.Logit(y_train, X_train,)

# Fit the model to the training data
result = model.fit()

# Get the model summary
summary = result.summary()
print(summary)

# Make predictions on the test set
y_pred_prob = result.predict(X_test)  # Predicted probabilities

# Convert probabilities into classes (0 or 1) with a 0.5 threshold
y_pred = (y_pred_prob > 0.5).astype(int)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Optimization terminated successfully.
         Current function value: 0.002094
         Iterations 19
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                 7000
Model:                          Logit   Df Residuals:                     6994
Method:                           MLE   Df Model:                            5
Date:                Fri, 06 Sep 2024   Pseudo R-squ.:                  0.9956
Time:                        11:10:36   Log-Likelihood:                -14.660
converged:                       True   LL-Null:                       -3353.7
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1            50.8079     13.239      3.838      0.000      24.861      76.755
x2             3.3195      1

### Third Model: Decision Tree Classifier

In [12]:
from sklearn.tree import DecisionTreeClassifier

# Split the data into training and test set with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=43)

# Create the decision tree model
model = DecisionTreeClassifier(random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Make prediction on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Confusion Matrix:
 [[2436    9]
 [   6  549]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2445
           1       0.98      0.99      0.99       555

    accuracy                           0.99      3000
   macro avg       0.99      0.99      0.99      3000
weighted avg       1.00      0.99      1.00      3000



### Fourth Model: Random Forest

In [25]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Split the data into training and validation sets with stratification
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

params = {
    "n_estimators": 100,
    "max_depth": 6,
    "min_samples_split": 10,
    "min_samples_leaf": 4,
    "bootstrap": True,
    "oob_score": False,
    "random_state": 42,
}

# Train the RandomForestRegressor
rf = RandomForestRegressor(**params)

# Fit the model on the training data
rf.fit(X_train, y_train)

# Predict on the validation set
y_pred = rf.predict(X_val)

# Calculate error metrics
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_pred)

# Assemble the metrics we're going to write into a collection
metrics = {"mae": mae, "mse": mse, "rmse": rmse, "r2": r2}

# Initiate the MLflow run context
with mlflow.start_run(run_name=run_name) as run:
    # Log the parameters used for the model fit
    mlflow.log_params(params)

    # Log the error metrics that were calculated during validation
    mlflow.log_metrics(metrics)

    # Log an instance of the trained model for later use
    mlflow.sklearn.log_model(
        sk_model=rf, input_example=X_val, artifact_path=artifact_path
    )


In [27]:
# Display metrics
metrics

{'mae': 0.00713665030347078,
 'mse': 0.0027204137655599306,
 'rmse': 0.05215758588700143,
 'r2': 0.981957129726016}