# CatBoost pipeline

Attention à la version de numpy : 1.26.4 ( au lieu de 2.2.2) pour que ça fonctionne ici

In [1]:
import pandas as pd 

dataframe = pd.read_csv("SBAnational.csv", low_memory=False)

D'abord effectuer les conversions, avant de supprimer des colonnes

In [2]:
import conversion_functions as conv
import math

print (f"Before : {dataframe.shape[0]} lines")

explanable_X = pd.DataFrame(dataframe)

explanable_X["State"] = explanable_X["State"].apply(lambda x : conv.get_state_code(x))
explanable_X["NAICS"] = explanable_X["NAICS"].apply(lambda x : conv.get_NAICS_data(x))

explanable_X["ApprovalFY"] = explanable_X["ApprovalFY"].apply(lambda x : conv.get_ApprovalFY_data(x)) 
mean_dataframe = explanable_X[ explanable_X["ApprovalFY"] !=50]
mean_value = mean_dataframe["ApprovalFY"].mean()
explanable_X.loc[explanable_X['ApprovalFY'] == 50, 'ApprovalFY'] = math.ceil(mean_value)

explanable_X["NewExist"] = explanable_X["NewExist"].apply(lambda x : conv.get_NewExist_data(x)) 
explanable_X["FranchiseCode"] = explanable_X["FranchiseCode"].apply(lambda x : conv.get_FranchiseCode_data(x)) 

explanable_X = explanable_X.drop("UrbanRural", axis=1)

explanable_X["RevLineCr"] = explanable_X["RevLineCr"].apply(lambda x : conv.get_RevLineCr_data(x)) 
explanable_X["LowDoc"] = explanable_X["LowDoc"].apply(lambda x : conv.get_LowDoc_data(x)) 

explanable_X["GrAppv"] = explanable_X["GrAppv"].apply(lambda x : conv.get_GrAppv_value(x)) 
mean_dataframe = explanable_X[ explanable_X["GrAppv"] !=0]
mean_value = mean_dataframe["GrAppv"].mean()
explanable_X.loc[explanable_X['GrAppv'] == 0, 'ApprovalFY'] = math.ceil(mean_value)

explanable_X = explanable_X[ explanable_X["GrAppv"] !=0] 

explanable_X = explanable_X.drop("SBA_Appv", axis = 1)


explanable_X['MIS_Status'] = explanable_X.apply(lambda row: conv.predict_MIS_Status_data(row), axis=1 )
explanable_X["MIS_Status"] = explanable_X["MIS_Status"].apply(lambda x : conv.get_MIS_Status_data(x))

print (f"After : {explanable_X.shape[0]} lines")


Before : 899164 lines
After : 899164 lines


Ne supprimer les colonnes qu'après les conversions ( qui utilisent les colonnes supprimées )

In [3]:
explanable_X = explanable_X.drop('LoanNr_ChkDgt', axis=1)
explanable_X = explanable_X.drop('Name', axis=1)
explanable_X = explanable_X.drop('City', axis=1)
explanable_X = explanable_X.drop('Zip', axis=1)
explanable_X = explanable_X.drop('Bank', axis=1)
explanable_X = explanable_X.drop('BankState', axis=1)
explanable_X = explanable_X.drop('ApprovalDate', axis=1)
explanable_X = explanable_X.drop('ChgOffDate', axis=1) # (explicit end on simulation?) : included 
explanable_X = explanable_X.drop('DisbursementDate', axis=1)  
explanable_X = explanable_X.drop('DisbursementGross', axis=1)  
explanable_X = explanable_X.drop('BalanceGross', axis=1)  # 14 valeurs seulement différentes de zéro  ?
explanable_X = explanable_X.drop('ChgOffPrinGr', axis=1)

In [4]:
explanable_X["MIS_Status"].value_counts()

MIS_Status
P I F     741345
CHGOFF    157819
Name: count, dtype: int64

In [5]:
import numpy
import pandas as pd 

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, OrdinalEncoder, LabelEncoder, LabelBinarizer, OneHotEncoder, Binarizer, FunctionTransformer, PolynomialFeatures,MinMaxScaler

def identity_operation(X) :
    return X

preprocessor = make_pipeline(  
    make_column_transformer(
        # LoanNr_ChkDgt, Name , City : excluded

        (FunctionTransformer(identity_operation, validate=False), ["State"]), 
        # Zip , Bank , BankState : excluded

        (FunctionTransformer(identity_operation, validate=False), ["NAICS"]), #included (2 first chars )
        
        # ApprovalDate : excluded
        (StandardScaler(), ["ApprovalFY"]),
        (StandardScaler(), ["Term"]), 
        (StandardScaler(), ["NoEmp"]),
        (StandardScaler(), ["NewExist"]),
        (StandardScaler(), ["CreateJob"]),
        (StandardScaler(), ["RetainedJob"]),
        (Binarizer(), ["FranchiseCode"]),
        #Binarizer("UrbanRural", threshold=1.5),

        (FunctionTransformer(identity_operation, validate=False), ["RevLineCr"]),
        (Binarizer(), ["LowDoc"]),
        # SimpleImputer("ChgOffDate"), # explicit end on simulation
        # DisbursementDate, DisbursementGross , BalanceGross ( 14 valeurs seulement différentes de zéro  ?)
        # MIS_Status : Excluded because it is Y !!!
        # ChgOffPrinGr : excluded
        (StandardScaler(), ["GrAppv"]), 
        #(StandardScaler(),["SBA_Appv"]),  
        remainder='passthrough'))

In [6]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from catboost import CatBoostClassifier, Pool
from scipy.stats import uniform, randint

# CatBoost prend en charge les données catégorielles nativement
categorical_features = ['State','NAICS', 'RevLineCr']

y = explanable_X['MIS_Status']
X = explanable_X.drop('MIS_Status', axis=1)

# Split en train et test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Préparation des données pour CatBoost (en spécifiant les caractéristiques catégorielles)
# Convertir les données d'entraînement en un objet Pool pour indiquer les variables catégorielles (si nécessaire)
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features)

test_pool = Pool(data=X_test, label=y_test, cat_features=categorical_features)


In [7]:
#durée estimée : 10 min (50 fits)

# Initialize CatBoostClassifier
cat_boost_model = CatBoostClassifier(learning_rate=0.1, iterations=100, depth=6, cat_features=categorical_features)

pipeline = Pipeline([
    ('preprocessor', preprocessor), 
    ('classifier', cat_boost_model)])

# Define the hyperparameters grid for RandomizedSearch
param_dist = {
    'iterations': randint(50, 200),       # Number of boosting iterations
    'depth': randint(4, 10),              # Depth of the trees
    'learning_rate': uniform(0.01, 0.3),  # Learning rate
    'l2_leaf_reg': uniform(1, 10),        # L2 regularization coefficient
    'border_count': randint(32, 255),     # Number of splits for categorical features
    'bagging_temperature': uniform(0, 1), # Controls the randomness in bagging
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(cat_boost_model, param_distributions=param_dist, 
                                   n_iter=10, scoring='roc_auc', 
                                   cv=5, verbose=1, n_jobs=-1)

# Fit the RandomizedSearchCV
random_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Meilleurs paramètres trouvés : ", random_search.best_params_)
print("Meilleur score AUC : ", random_search.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
0:	learn: 0.6681765	total: 686ms	remaining: 39.8s
0:	learn: 0.6681999	total: 710ms	remaining: 41.2s
1:	learn: 0.6450468	total: 1.52s	remaining: 43.4s
0:	learn: 0.6681649	total: 758ms	remaining: 43.9s
0:	learn: 0.5953779	total: 874ms	remaining: 1m 47s
0:	learn: 0.6253726	total: 1.02s	remaining: 2m 10s
1:	learn: 0.6453152	total: 1.5s	remaining: 42.7s
0:	learn: 0.6684251	total: 831ms	remaining: 48.2s
0:	learn: 0.6254863	total: 1.1s	remaining: 2m 19s
1:	learn: 0.6450779	total: 1.42s	remaining: 40.4s
2:	learn: 0.6235553	total: 2.3s	remaining: 42.9s
0:	learn: 0.6256145	total: 1.26s	remaining: 2m 40s
1:	learn: 0.5252051	total: 1.77s	remaining: 1m 48s
1:	learn: 0.6454863	total: 1.59s	remaining: 45.2s
2:	learn: 0.6238137	total: 2.48s	remaining: 46.4s
1:	learn: 0.5700391	total: 2.26s	remaining: 2m 22s
0:	learn: 0.6248693	total: 1.49s	remaining: 3m 9s
0:	learn: 0.6254167	total: 1.16s	remaining: 2m 26s
0:	learn: 0.6682224	total: 948ms	re



111:	learn: 0.1387260	total: 2m 13s	remaining: 0us
3:	learn: 0.5374357	total: 3.31s	remaining: 1m 58s
92:	learn: 0.1860882	total: 1m 40s	remaining: 58.2s
85:	learn: 0.1421238	total: 1m 45s	remaining: 31.8s
0:	learn: 0.6453260	total: 706ms	remaining: 1m 43s
109:	learn: 0.1391950	total: 2m 13s	remaining: 2.43s
40:	learn: 0.2322219	total: 46.7s	remaining: 2m
0:	learn: 0.6455475	total: 864ms	remaining: 2m 6s
104:	learn: 0.1401777	total: 2m 11s	remaining: 8.77s
1:	learn: 0.6041933	total: 1.51s	remaining: 1m 49s
93:	learn: 0.1858145	total: 1m 41s	remaining: 57.2s
4:	learn: 0.5092281	total: 4.55s	remaining: 2m 9s
86:	learn: 0.1419547	total: 1m 46s	remaining: 30.5s
110:	learn: 0.1390207	total: 2m 14s	remaining: 1.21s
1:	learn: 0.6043933	total: 1.64s	remaining: 1m 59s
41:	learn: 0.2305404	total: 47.8s	remaining: 1m 59s
105:	learn: 0.1400747	total: 2m 12s	remaining: 7.5s
2:	learn: 0.5685545	total: 2.31s	remaining: 1m 50s
94:	learn: 0.1854554	total: 1m 42s	remaining: 55.9s
5:	learn: 0.4846131	tot

In [None]:
#durée estimée : 40 min (200 fits), réelle : 

#Meilleur score AUC :  0.9771748465313973 avec one_hot_encoder
# param_dist = {
#     'iterations': [191],       # Number of boosting iterations
#     'depth': [7],              # Depth of the trees
#     'learning_rate': [0.2621537355139263],  # Learning rate
#     'l2_leaf_reg': [9.620670562880655],        # L2 regularization coefficient
#     'bagging_temperature': [0.09440670531484197], # Controls the randomness in bagging
# }

#Meilleur score AUC : 0.9773692063410346
param_dist = {
    'iterations': [173], 
    'depth': [8], 
    'learning_rate':[0.2628238092699549],
    'l2_leaf_reg': [4.642981495447009], 
    'bagging_temperature': [0.6728806036024608], 
    'border_count': [106]
}


# Initialize CatBoostClassifier
cat_boost_model = CatBoostClassifier(iterations = 150, learning_rate=0.2, depth=8, cat_features=categorical_features)

pipeline = Pipeline([
    ('preprocessor', preprocessor), 
    ('classifier', cat_boost_model)])

# Utilisation dans un gridsearch

best_params_random = random_search.best_params_

param_grid = {
    'iterations': [171, 172, 173, 174, 175],
    'depth': [7, 8, 9],
    'learning_rate': [0.25, 0.26, 0.27],
    'l2_leaf_reg': [4.642981495447009],
    'border_count': [106],
    'bagging_temperature': [0.6728806036024608]
}

# Créer le GridSearchCV avec le param_grid
grid_search = GridSearchCV(estimator=cat_boost_model, param_grid=param_grid, 
                           scoring='roc_auc', cv=5, n_jobs=-1, verbose=1)

# Ajuster GridSearchCV
grid_search.fit(X_train, y_train)

# Afficher les meilleurs paramètres et le meilleur score
print("Meilleurs paramètres trouvés par GridSearch : ", grid_search.best_params_)
print("Meilleur score AUC de GridSearch : ", grid_search.best_score_)


Fitting 5 folds for each of 45 candidates, totalling 225 fits
0:	learn: 0.5442686	total: 663ms	remaining: 1m 52s
0:	learn: 0.5441383	total: 986ms	remaining: 2m 47s
0:	learn: 0.5450985	total: 1.07s	remaining: 3m 1s
0:	learn: 0.5390385	total: 741ms	remaining: 2m 5s
0:	learn: 0.5396055	total: 772ms	remaining: 2m 11s
1:	learn: 0.4535586	total: 1.67s	remaining: 2m 20s
0:	learn: 0.5446865	total: 1.27s	remaining: 3m 36s
0:	learn: 0.5447942	total: 1.34s	remaining: 3m 48s
0:	learn: 0.5391733	total: 1.2s	remaining: 3m 23s
0:	learn: 0.5400315	total: 1.23s	remaining: 3m 29s
1:	learn: 0.4470921	total: 1.6s	remaining: 2m 15s
1:	learn: 0.4475999	total: 1.7s	remaining: 2m 23s
1:	learn: 0.4534407	total: 2.23s	remaining: 3m 8s
2:	learn: 0.3919331	total: 2.62s	remaining: 2m 26s
1:	learn: 0.4537752	total: 2.31s	remaining: 3m 14s
1:	learn: 0.4541674	total: 1.97s	remaining: 2m 46s
0:	learn: 0.5397178	total: 1.56s	remaining: 4m 25s
1:	learn: 0.4539375	total: 2.25s	remaining: 3m 9s
0:	learn: 0.5341422	total: 



1:	learn: 0.4480258	total: 1.91s	remaining: 2m 42s
167:	learn: 0.1396963	total: 2m 52s	remaining: 5.12s
4:	learn: 0.3157102	total: 4.63s	remaining: 2m 35s
171:	learn: 0.1396227	total: 2m 59s	remaining: 1.04s
169:	learn: 0.1396643	total: 2m 56s	remaining: 3.12s
10:	learn: 0.2310740	total: 10s	remaining: 2m 27s
2:	learn: 0.3881871	total: 2.63s	remaining: 2m 28s
168:	learn: 0.1396528	total: 2m 52s	remaining: 4.09s
172:	learn: 0.1395972	total: 3m	remaining: 0us
5:	learn: 0.2908710	total: 5.62s	remaining: 2m 36s
170:	learn: 0.1396227	total: 2m 57s	remaining: 2.08s
0:	learn: 0.5397178	total: 1.05s	remaining: 3m 1s
11:	learn: 0.2255310	total: 10.8s	remaining: 2m 25s
3:	learn: 0.3479069	total: 3.68s	remaining: 2m 35s
169:	learn: 0.1396108	total: 2m 53s	remaining: 3.06s
0:	learn: 0.5340030	total: 1.07s	remaining: 3m 3s
6:	learn: 0.2749625	total: 6.35s	remaining: 2m 30s
171:	learn: 0.1395884	total: 2m 58s	remaining: 1.04s
12:	learn: 0.2191345	total: 11.7s	remaining: 2m 23s
1:	learn: 0.4478397	to

In [None]:
#%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
#from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score

test_score = random_search.score(X_test, y_test)

# Prédictions sur le jeu de test
y_pred = random_search.predict(X_test)

# Évaluation du modèle
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Prédire sur l'ensemble de test
y_pred_proba = random_search.predict_proba(X_test)[:, 1]  # Probabilités pour la classe positive

# Calculer l'AUC sur l'ensemble de test
auc = roc_auc_score(y_test, y_pred_proba)
print("AUC sur l'ensemble de test : ", auc)

# Calculer et afficher la matrice de confusion
conf_matrix = confusion_matrix(y_test, y_pred)
print("Matrice de confusion :")
print(conf_matrix)

# Affichage avec Seaborn (optionnel)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Classe 0', 'Classe 1'], yticklabels=['Classe 0', 'Classe 1'])
plt.xlabel('Prédictions')
plt.ylabel('Vraies étiquettes')
plt.title('Matrice de Confusion')
plt.show()

from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

# # Afficher la matrice de confusion
# print("Matrice de confusion :\n", conf_matrix)
# plt.figure(figsize=(6, 6))
# plot_confusion_matrix(random_search.best_estimator_, X_test, y_test, cmap=plt.cm.Blues)
# plt.title('Matrice de Confusion')
# plt.show()


In [None]:
from sklearn.metrics import f1_score

# y_test : vraies étiquettes
# y_pred : prédictions du modèle
macro_f1 = f1_score(y_test, y_pred, average='macro')

print("Macro F1 Score : ", macro_f1)