# CatBoost pipeline

Attention à la version de numpy : 1.26.4 ( au lieu de 2.2.2) pour que ça fonctionne ici

In [7]:
import pandas as pd 

dataframe = pd.read_csv("SBAnational.csv", low_memory=False)

D'abord effectuer les conversions, avant de supprimer des colonnes

In [8]:
import conversion_functions as conv
import math

print (f"Before : {dataframe.shape[0]} lines")

explanable_X = pd.DataFrame(dataframe)

explanable_X["State"] = explanable_X["State"].apply(lambda x : conv.get_state_code(x))
explanable_X["NAICS"] = explanable_X["NAICS"].apply(lambda x : conv.get_NAICS_data(x))

explanable_X["ApprovalFY"] = explanable_X["ApprovalFY"].apply(lambda x : conv.get_ApprovalFY_data(x)) 
mean_dataframe = explanable_X[ explanable_X["ApprovalFY"] !=50]
mean_value = mean_dataframe["ApprovalFY"].mean()
explanable_X.loc[explanable_X['ApprovalFY'] == 50, 'ApprovalFY'] = math.ceil(mean_value)

explanable_X["NewExist"] = explanable_X["NewExist"].apply(lambda x : conv.get_NewExist_data(x)) 
explanable_X["FranchiseCode"] = explanable_X["FranchiseCode"].apply(lambda x : conv.get_FranchiseCode_data(x)) 

explanable_X = explanable_X.drop("UrbanRural", axis=1)

explanable_X["RevLineCr"] = explanable_X["RevLineCr"].apply(lambda x : conv.get_RevLineCr_data(x)) 
explanable_X["LowDoc"] = explanable_X["LowDoc"].apply(lambda x : conv.get_LowDoc_data(x)) 

explanable_X["GrAppv"] = explanable_X["GrAppv"].apply(lambda x : conv.get_GrAppv_value(x)) 
mean_dataframe = explanable_X[ explanable_X["GrAppv"] !=0]
mean_value = mean_dataframe["GrAppv"].mean()
explanable_X.loc[explanable_X['GrAppv'] == 0, 'ApprovalFY'] = math.ceil(mean_value)

explanable_X = explanable_X[ explanable_X["GrAppv"] !=0] 

explanable_X = explanable_X.drop("SBA_Appv", axis = 1)

# explanable_X = explanable_X[(explanable_X["MIS_Status"]=="P I F") |
#                               (explanable_X["MIS_Status"]=="CHGOFF")]

explanable_X['MIS_Status'] = explanable_X.apply(lambda row: conv.predict_MIS_Status_data(row), axis=1 )
explanable_X["MIS_Status"] = explanable_X["MIS_Status"].apply(lambda x : conv.get_MIS_Status_data(x))

print (f"After : {explanable_X.shape[0]} lines")


Before : 899164 lines
After : 899164 lines


Ne supprimer les colonnes qu'après les conversions ( qui utilisent les colonnes supprimées )

In [9]:
explanable_X = explanable_X.drop('LoanNr_ChkDgt', axis=1)
explanable_X = explanable_X.drop('Name', axis=1)
explanable_X = explanable_X.drop('City', axis=1)
explanable_X = explanable_X.drop('Zip', axis=1)
explanable_X = explanable_X.drop('Bank', axis=1)
explanable_X = explanable_X.drop('BankState', axis=1)
explanable_X = explanable_X.drop('ApprovalDate', axis=1)
explanable_X = explanable_X.drop('ChgOffDate', axis=1) # (explicit end on simulation?) : included 
explanable_X = explanable_X.drop('DisbursementDate', axis=1)  
explanable_X = explanable_X.drop('DisbursementGross', axis=1)  
explanable_X = explanable_X.drop('BalanceGross', axis=1)  # 14 valeurs seulement différentes de zéro  ?
explanable_X = explanable_X.drop('ChgOffPrinGr', axis=1)

In [10]:
explanable_X["MIS_Status"].value_counts()

MIS_Status
P I F     741392
CHGOFF    157772
Name: count, dtype: int64

In [11]:
import numpy
import pandas as pd 

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, OrdinalEncoder, LabelBinarizer, OneHotEncoder, Binarizer, FunctionTransformer, PolynomialFeatures,MinMaxScaler

preprocessor = make_pipeline(  
    make_column_transformer(
        # LoanNr_ChkDgt, Name , City : excluded
        (OneHotEncoder(), ["State"]), 
        # Zip , Bank , BankState : excluded
        (OneHotEncoder(), ["NAICS"]), #included (2 first chars )
        # ApprovalDate : excluded
        (StandardScaler(), ["ApprovalFY"]),
        (StandardScaler(), ["Term"]), 
        (StandardScaler(), ["NoEmp"]),
        (StandardScaler(), ["NewExist"]),
        (StandardScaler(), ["CreateJob"]),
        (StandardScaler(), ["RetainedJob"]),
        (Binarizer(), ["FranchiseCode"]),
        #Binarizer("UrbanRural", threshold=1.5),
        (OneHotEncoder(), ["RevLineCr"]),
        (Binarizer(), ["LowDoc"]),
        # SimpleImputer("ChgOffDate"), # explicit end on simulation
        # DisbursementDate, DisbursementGross , BalanceGross ( 14 valeurs seulement différentes de zéro  ?)
        # MIS_Status : Excluded because it is Y !!!
        # ChgOffPrinGr : excluded
        (StandardScaler(), ["GrAppv"]), 
        #(StandardScaler(),["SBA_Appv"]),  
        remainder='passthrough'))

In [14]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from catboost import CatBoostClassifier, Pool
from scipy.stats import uniform, randint

# CatBoost prend en charge les données catégorielles nativement
categorical_features = ['State','NAICS', 'RevLineCr']

y = explanable_X['MIS_Status']
X = explanable_X.drop('MIS_Status', axis=1)

# Split en train et test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Préparation des données pour CatBoost (en spécifiant les caractéristiques catégorielles)
# Convertir les données d'entraînement en un objet Pool pour indiquer les variables catégorielles (si nécessaire)
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features)

test_pool = Pool(data=X_test, label=y_test, cat_features=categorical_features)


In [10]:
# Initialize CatBoostClassifier
cat_boost_model = CatBoostClassifier(learning_rate=0.1, iterations=100, depth=6, cat_features=categorical_features)

pipeline = Pipeline([
    ('preprocessor', preprocessor), 
    ('classifier', cat_boost_model)])

# Define the hyperparameters grid for RandomizedSearch
param_dist = {
    'iterations': randint(50, 200),       # Number of boosting iterations
    'depth': randint(4, 10),              # Depth of the trees
    'learning_rate': uniform(0.01, 0.3),  # Learning rate
    'l2_leaf_reg': uniform(1, 10),        # L2 regularization coefficient
    #'border_count': randint(32, 255),     # Number of splits for categorical features
    'bagging_temperature': uniform(0, 1), # Controls the randomness in bagging
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(cat_boost_model, param_distributions=param_dist, 
                                   n_iter=10, scoring='roc_auc', 
                                   cv=5, verbose=1, n_jobs=-1)



# Fit the RandomizedSearchCV
random_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Meilleurs paramètres trouvés : ", random_search.best_params_)
print("Meilleur score AUC : ", random_search.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
0:	learn: 0.6180725	total: 664ms	remaining: 1m 40s
0:	learn: 0.6179822	total: 833ms	remaining: 2m 6s
1:	learn: 0.5588105	total: 1.57s	remaining: 1m 58s
0:	learn: 0.6186817	total: 1.06s	remaining: 2m 41s
0:	learn: 0.6180394	total: 1.17s	remaining: 2m 57s
1:	learn: 0.5586683	total: 1.56s	remaining: 1m 58s
2:	learn: 0.5112903	total: 2.43s	remaining: 2m 1s
0:	learn: 0.6189649	total: 1.36s	remaining: 3m 26s
0:	learn: 0.6797410	total: 1.06s	remaining: 2m 56s
1:	learn: 0.5593267	total: 2.24s	remaining: 2m 48s
3:	learn: 0.4726390	total: 3.13s	remaining: 1m 56s
0:	learn: 0.5566470	total: 1.01s	remaining: 1m 1s
0:	learn: 0.5560146	total: 2.09s	remaining: 2m 7s
1:	learn: 0.6668412	total: 1.81s	remaining: 2m 29s
0:	learn: 0.5565914	total: 1.97s	remaining: 1m 59s
2:	learn: 0.5110613	total: 3.13s	remaining: 2m 36s
1:	learn: 0.5587635	total: 2.85s	remaining: 3m 34s
2:	learn: 0.5117038	total: 3.39s	remaining: 2m 49s
0:	learn: 0.5566688	total



182:	learn: 0.1386212	total: 3m 43s	remaining: 9.77s
184:	learn: 0.1388435	total: 3m 43s	remaining: 7.24s
58:	learn: 0.1916323	total: 1m 22s	remaining: 37.9s
101:	learn: 0.1432837	total: 2m 35s	remaining: 41.2s
181:	learn: 0.1394520	total: 3m 45s	remaining: 11.2s
95:	learn: 0.1441788	total: 2m 28s	remaining: 51.1s
84:	learn: 0.1457934	total: 2m 7s	remaining: 1m 5s
154:	learn: 0.1404435	total: 3m 1s	remaining: 42.2s
81:	learn: 0.1468727	total: 2m 9s	remaining: 1m 14s
85:	learn: 0.1751947	total: 2m 5s	remaining: 0us
183:	learn: 0.1385560	total: 3m 44s	remaining: 8.54s
185:	learn: 0.1387757	total: 3m 44s	remaining: 6.04s
182:	learn: 0.1394114	total: 3m 46s	remaining: 9.9s
117:	learn: 0.1407201	total: 2m 55s	remaining: 16.3s
59:	learn: 0.1909590	total: 1m 24s	remaining: 36.5s
155:	learn: 0.1403505	total: 3m 2s	remaining: 41s
102:	learn: 0.1430879	total: 2m 36s	remaining: 39.6s
85:	learn: 0.1455900	total: 2m 8s	remaining: 1m 4s
82:	learn: 0.1467284	total: 2m 10s	remaining: 1m 12s
96:	learn:

In [16]:
# Initialize CatBoostClassifier
cat_boost_model = CatBoostClassifier(learning_rate=0.1, iterations=100, depth=6, cat_features=categorical_features)

pipeline = Pipeline([
    ('preprocessor', preprocessor), 
    ('classifier', cat_boost_model)])

#Meilleur score AUC :  0.9771748465313973
param_dist = {
    'iterations': [191],       # Number of boosting iterations
    'depth': [7],              # Depth of the trees
    'learning_rate': [0.2621537355139263],  # Learning rate
    'l2_leaf_reg': [9.620670562880655],        # L2 regularization coefficient
    'bagging_temperature': [0.09440670531484197], # Controls the randomness in bagging
}


# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(cat_boost_model, param_distributions=param_dist, 
                                   n_iter=10, scoring='roc_auc', 
                                   cv=5, verbose=1, n_jobs=-1)

# Fit the RandomizedSearchCV
random_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Meilleurs paramètres trouvés : ", random_search.best_params_)
print("Meilleur score AUC : ", random_search.best_score_)




Fitting 5 folds for each of 1 candidates, totalling 5 fits
0:	learn: 0.5381100	total: 404ms	remaining: 1m 16s
0:	learn: 0.5379234	total: 412ms	remaining: 1m 18s
0:	learn: 0.5382643	total: 384ms	remaining: 1m 13s
0:	learn: 0.5394523	total: 531ms	remaining: 1m 40s
1:	learn: 0.4464967	total: 870ms	remaining: 1m 22s
1:	learn: 0.4467908	total: 858ms	remaining: 1m 21s
1:	learn: 0.4466133	total: 942ms	remaining: 1m 29s
1:	learn: 0.4470616	total: 935ms	remaining: 1m 28s
0:	learn: 0.5384857	total: 526ms	remaining: 1m 39s
2:	learn: 0.3872242	total: 1.24s	remaining: 1m 17s
1:	learn: 0.4466464	total: 887ms	remaining: 1m 23s
2:	learn: 0.3867406	total: 1.32s	remaining: 1m 23s
2:	learn: 0.3865446	total: 1.41s	remaining: 1m 28s
2:	learn: 0.3860205	total: 1.48s	remaining: 1m 33s
3:	learn: 0.3443103	total: 1.64s	remaining: 1m 16s
3:	learn: 0.3438297	total: 1.71s	remaining: 1m 19s
2:	learn: 0.3867527	total: 1.35s	remaining: 1m 24s
3:	learn: 0.3459746	total: 1.9s	remaining: 1m 28s
4:	learn: 0.3124105	tota

In [19]:
# %matplotlib_inline
# import matplotlib.pyplot as plt
#from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score

test_score = random_search.score(X_test, y_test)

# Prédictions sur le jeu de test
y_pred = random_search.predict(X_test)

# Évaluation du modèle
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Prédire sur l'ensemble de test
y_pred_proba = random_search.predict_proba(X_test)[:, 1]  # Probabilités pour la classe positive

# Calculer l'AUC sur l'ensemble de test
auc = roc_auc_score(y_test, y_pred_proba)
print("AUC sur l'ensemble de test : ", auc)

# Calculer et afficher la matrice de confusion
conf_matrix = confusion_matrix(y_test, y_pred)
print("Matrice de confusion :")
print(conf_matrix)

from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


# # Afficher la matrice de confusion
# print("Matrice de confusion :\n", conf_matrix)
# plt.figure(figsize=(6, 6))
# plot_confusion_matrix(random_search.best_estimator_, X_test, y_test, cmap=plt.cm.Blues)
# plt.title('Matrice de Confusion')
# plt.show()


Accuracy: 94.80%
AUC sur l'ensemble de test :  0.977771904536866
Matrice de confusion :
[[ 38998   8334]
 [  5694 216724]]
              precision    recall  f1-score   support

      CHGOFF       0.87      0.82      0.85     47332
       P I F       0.96      0.97      0.97    222418

    accuracy                           0.95    269750
   macro avg       0.92      0.90      0.91    269750
weighted avg       0.95      0.95      0.95    269750

