In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
import lightgbm as lgb
from sklearn.preprocessing import PolynomialFeatures, OrdinalEncoder, OneHotEncoder, StandardScaler, RobustScaler, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score,make_scorer, accuracy_score, f1_score, recall_score, classification_report
import seaborn as sns

In [3]:
df = pd.read_csv('dataset.csv')
df['NewExist'] = df['NewExist'].fillna(0)

features_of_interest = [
    'State','cat_activites', 'UrbanRural','LowDoc','bank_loan_float','SBA_loan_float','FranchiseCode','BankState','RevLineCr','Term','ApprovalFY',
    'MIS_Status',
]

numerical_column = ['bank_loan_float','SBA_loan_float','Term'
    
]


ordinal_column = [
    'LowDoc'
]

categorical_column = [
    'State','cat_activites','FranchiseCode','BankState','RevLineCr',
]


target_name = "MIS_Status"
data, target, numerical_data, ordinal_data, categorical_data = (
    df[features_of_interest],
    df[target_name],
    df[numerical_column],
    df[ordinal_column],
    df[categorical_column]
)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(data, target,train_size=0.9, random_state=42, stratify=data['MIS_Status'])
# Retirer la colonne 'MIS_Status' des ensembles X_train et X_test (car elle ne fait pas partie des features)
X_train = X_train.drop('MIS_Status', axis=1)
X_test = X_test.drop('MIS_Status', axis=1)


label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.fit_transform(y_test)

In [5]:
preprocessor = ColumnTransformer(
    [("categorical", OneHotEncoder(handle_unknown='ignore'), categorical_column),
    ("ordinal", OrdinalEncoder(), ordinal_column),
    ("numeric", StandardScaler(), numerical_column),
    ],
    remainder="passthrough",
)

In [15]:



clf = make_pipeline(preprocessor,lgb.LGBMClassifier())
clf.fit(X_train, y_train_encoded)



[LightGBM] [Info] Number of positive: 667210, number of negative: 142037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.294565 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2150
[LightGBM] [Info] Number of data points in the train set: 809247, number of used features: 673
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.824483 -> initscore=1.547017
[LightGBM] [Info] Start training from score 1.547017


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [16]:
# Si c'est un problème binaire, prenez la probabilité pour la classe 1 (classe positive)
proba = clf.predict_proba(X_test)[:, 1]
y_pred = clf.predict(X_test)
# Calcul du score AUC-ROC
roc_auc = roc_auc_score(y_test_encoded, proba)
print("AUC-ROC score:", roc_auc)
print('accuracy', clf.score(X_test,y_test_encoded))
print('rappel', recall_score(y_test_encoded,y_pred,pos_label=0))
print('f1_score',f1_score(y_test_encoded,y_pred, pos_label=0))
print(classification_report(y_test_encoded,y_pred))



AUC-ROC score: 0.9777583347815546




accuracy 0.9479075147080085
rappel 0.8154226333797998
f1_score 0.8460324764972718
              precision    recall  f1-score   support

           0       0.88      0.82      0.85     15782
           1       0.96      0.98      0.97     74135

    accuracy                           0.95     89917
   macro avg       0.92      0.90      0.91     89917
weighted avg       0.95      0.95      0.95     89917



In [19]:
clf[-1].get_features_out()

AttributeError: 'LGBMClassifier' object has no attribute 'get_features_out'

In [20]:
df_rf = pd.DataFrame(clf[0].get_feature_names_out(), clf[-1].feature_importances_)

In [26]:

df_rf[0].unique()
df_rf.filter(con)

array(['categorical__State_AK', 'categorical__State_AL',
       'categorical__State_AP', ..., 'numeric__Term',
       'remainder__UrbanRural', 'remainder__ApprovalFY'],
      shape=(2832,), dtype=object)

In [21]:
clf[-1].feature_importances_.sum()

np.int64(3000)

In [9]:
pd.DataFrame.from_dict({'features' : clf[-1].feature_names_in_, 'importances':clf[-1].feature_importances_}).sort_values(by = 'importances', ascending=False)

# plt.Figure(figsize=(25,10))
# sns.barplot(pd_clf, x = 'features',y = 'importances')
# plt.tight_layout()
# plt.show()


Unnamed: 0,features,importances
2830,Column_2830,584
2832,Column_2832,182
4014357,Column_4014357,169
4014355,Column_4014355,158
223477,Column_223477,80
...,...,...
164,Column_164,0
165,Column_165,0
4014358,Column_4014358,0
166,Column_166,0


In [10]:
y_pred = clf.predict(X_test)
y_pred



array([1, 0, 1, ..., 1, 1, 1], shape=(89917,))

In [14]:
import pickle
pickle.dump(clf,open('clf_lgbm.pkl','wb'))
