In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [3]:
# Load the data from the 'Loan_Data.csv' file into a DataFrame named df
df = pd.read_csv('Loan_Data.csv')
df.head()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


In [4]:
df.shape

(10000, 8)

In [5]:
df.dtypes

customer_id                   int64
credit_lines_outstanding      int64
loan_amt_outstanding        float64
total_debt_outstanding      float64
income                      float64
years_employed                int64
fico_score                    int64
default                       int64
dtype: object

Le DataFrame contient 10k observations et 8 variables, toutes numériques.

In [6]:
# Find columns with NaN values
# Count NaN values for each column
nan_counts = df.isna().sum()

# Filter and print only the columns with NaN values and their counts
nan_columns_counts = nan_counts[nan_counts > 0]
nan_columns_counts

Series([], dtype: int64)

La DataFrame ne contient aucun NaN.

In [7]:
def standard_columns(column):
    return (column - column.mean()) / column.std() 

# Standardize the floats variables
df[df.select_dtypes(["float"]).columns] = df[
    df.select_dtypes(["float"]).columns 
].apply(standard_columns)
df.head()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,0.747058,-0.724812,0.398535,5,605,0
1,7442532,5,-1.548297,-0.073963,-2.161768,2,572,1
2,2256073,0,-0.560481,-1.009645,-0.207909,4,602,0
3,4885975,0,0.427024,-0.938137,0.215073,5,612,0
4,4700614,1,-1.979634,-1.048728,-2.321198,6,631,0


In [33]:
# Avoid scientific digital format
pd.options.display.float_format = '{:.2f}'.format
df.describe()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4974577.0,1.46,-0.0,-0.0,0.0,4.55,637.56,0.19
std,2293889.71,1.74,1.0,1.0,1.0,1.57,60.66,0.39
min,1000324.0,0.0,-2.89,-1.31,-3.44,0.0,408.0,0.0
25%,2977661.0,0.0,-0.71,-0.68,-0.67,3.0,597.0,0.0
50%,4989501.5,1.0,-0.08,-0.3,0.0,5.0,638.0,0.0
75%,6967210.25,2.0,0.63,0.39,0.67,6.0,679.0,0.0
max,8999789.0,5.0,4.64,5.28,3.9,10.0,850.0,1.0


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

col_target = "default"
col_features = df.columns.tolist()[1:-1] # Get all the columns except the customer_id and the target columns

X = np.array(df[col_features]).astype(float) # Convert the features to a numpy array
y = np.array(df[col_target]) # Convert the target to a numpy array

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=43)

# Créer le modèle de régression logistique
model = sm.Logit(y_train, X_train)

# Ajuster le modèle
result = model.fit()

# Obtenir le résumé du modèle
summary = result.summary()
print(summary)

# Faire des prédictions sur l'ensemble de test
y_pred_prob = result.predict(X_test)  # Probabilités prédites

# Convertir les probabilités en classes (0 ou 1) avec un seuil de 0.5
y_pred = (y_pred_prob > 0.5).astype(int)

# Évaluer le modèle
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Optimization terminated successfully.
         Current function value: 0.001306
         Iterations 21
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                 6000
Model:                          Logit   Df Residuals:                     5994
Method:                           MLE   Df Model:                            5
Date:                Mon, 02 Sep 2024   Pseudo R-squ.:                  0.9973
Time:                        22:42:45   Log-Likelihood:                -7.8371
converged:                       True   LL-Null:                       -2873.3
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1            98.3248     41.175      2.388      0.017      17.624     179.026
x2             8.2474      3

  return 1/(1+np.exp(-X))
  return 1/(1+np.exp(-X))


In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Définir les colonnes de caractéristiques et la colonne cible
col_target = "default"
col_features = df.columns.tolist()[1:-1]  # Get all the columns except the customer_id and the target columns

# Préparer les données
X = df[col_features].astype(float)
y = df[col_target]

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=43)

# Créer le modèle de l'arbre de décision
model = DecisionTreeClassifier(random_state=42)

# Ajuster le modèle
model.fit(X_train, y_train)

# Faire des prédictions sur l'ensemble de test
y_pred = model.predict(X_test)

# Évaluer le modèle
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))




Confusion Matrix:
 [[3251    8]
 [  11  730]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3259
           1       0.99      0.99      0.99       741

    accuracy                           1.00      4000
   macro avg       0.99      0.99      0.99      4000
weighted avg       1.00      1.00      1.00      4000



In [15]:

# starts an MLflow server locally.
!mlflow server --host 127.0.0.1 --port 8082

^C


In [13]:
from mlflow import MlflowClient
from pprint import pprint
from sklearn.ensemble import RandomForestRegressor


In [16]:
# In order to connect to the tracking server, we’ll need to use the uri that we assigned the server when we started it.

client = MlflowClient(tracking_uri="http://127.0.0.1:8082")

#it allows programmatic interaction with the MLflow tracking server.

In [17]:
all_experiments = client.search_experiments()

print(all_experiments)

[<Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1725310745644, experiment_id='0', last_update_time=1725310745644, lifecycle_stage='active', name='Default', tags={}>]


In [22]:
# Provide an Experiment description that will appear in the UI
experiment_description = (
    "This is the loan default forecasting project. "
    "This experiment contains the produce models for bank loans."
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "ML OPS Loan Default",
    "store_dept": "predict",
    "team": "Rodner Kallel Rigoni",
    "project_quarter": "Q3-2024",
    "mlflow.note.content": experiment_description,
}

# Create the Experiment, providing a unique name
produce_loans_experiment = client.create_experiment(
    name="Loan_Default_Models", tags=experiment_tags
)


In [24]:
# Use search_experiments() to search on the project_name tag key

loans_experiment = client.search_experiments(
    filter_string="tags.`project_name` = 'ML OPS Loan Default'"
)

print(vars(loans_experiment[0]))

{'_experiment_id': '601071654062137473', '_name': 'Loan_Default_Models', '_artifact_location': 'mlflow-artifacts:/601071654062137473', '_lifecycle_stage': 'active', '_tags': {'mlflow.note.content': 'This is the loan default forecasting project. This experiment contains the produce models for bank loans.', 'project_name': 'ML OPS Loan Default', 'project_quarter': 'Q3-2024', 'store_dept': 'predict', 'team': 'Rodner Kallel Rigoni'}, '_creation_time': 1725311643292, '_last_update_time': 1725311643292}


In [26]:
# This function call sets the global tracking URI for the current session.
# It’s a convenient way to configure the tracking server URI without creating a separate client instance.
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:8082")

In [27]:
# Sets the current active experiment to the "Loan_Default_Models" experiment and
# returns the Experiment metadata
loan_experiment = mlflow.set_experiment("Loan_Default_Models")

# Define a run name for this iteration of training.
# If this is not set, a unique name will be auto-generated for your run.
run_name = "loans_rf_test"

# Define an artifact path that the model will be saved to.
artifact_path = "rf_loans"

In [28]:
data = df

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [31]:
# Split the data into features and target and drop irrelevant date field and target field
X = data.drop(columns=["customer_id", "default"])
y = data["default"]

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

params = {
    "n_estimators": 100,
    "max_depth": 6,
    "min_samples_split": 10,
    "min_samples_leaf": 4,
    "bootstrap": True,
    "oob_score": False,
    "random_state": 888,
}

# Train the RandomForestRegressor
rf = RandomForestRegressor(**params)

# Fit the model on the training data
rf.fit(X_train, y_train)

# Predict on the validation set
y_pred = rf.predict(X_val)

# Calculate error metrics
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_pred)

# Assemble the metrics we're going to write into a collection
metrics = {"mae": mae, "mse": mse, "rmse": rmse, "r2": r2}

# Initiate the MLflow run context
with mlflow.start_run(run_name=run_name) as run:
    # Log the parameters used for the model fit
    mlflow.log_params(params)

    # Log the error metrics that were calculated during validation
    mlflow.log_metrics(metrics)

    # Log an instance of the trained model for later use
    mlflow.sklearn.log_model(
        sk_model=rf, input_example=X_val, artifact_path=artifact_path
    )


