In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures, OrdinalEncoder, OneHotEncoder, StandardScaler, RobustScaler, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score,make_scorer, accuracy_score, f1_score, recall_score, classification_report

### LES FEATURES A GARDER SUITE A L'EDA

## Analyse du CHI2
State, NAICS(ou le cat_activites), UrbanRural, LowDoc, Bank,RevLineCr,FranchiseCode

## Analyse de l'ANOVA
DisbursementGross,bank_loan_float(float de GrAppv), SBA_loan_float(SBA_Appv)

In [3]:
df = pd.read_csv('dataset.csv')
df['NewExist'] = df['NewExist'].fillna(0)
df.columns

Index(['LoanNr_ChkDgt', 'Name', 'City', 'State', 'Zip', 'Bank', 'BankState',
       'NAICS', 'ApprovalDate', 'ApprovalFY', 'Term', 'NoEmp', 'NewExist',
       'CreateJob', 'RetainedJob', 'FranchiseCode', 'UrbanRural', 'RevLineCr',
       'LowDoc', 'ChgOffDate', 'DisbursementDate', 'DisbursementGross',
       'BalanceGross', 'MIS_Status', 'ChgOffPrinGr', 'GrAppv', 'SBA_Appv',
       'cat_activites', 'SBA_loan_float', 'bank_loan_float', 'crisis'],
      dtype='object')

In [4]:
features_of_interest = [
    'State','cat_activites', 'UrbanRural','LowDoc','bank_loan_float','SBA_loan_float','FranchiseCode','crisis','BankState','Term','RevLineCr',
    'MIS_Status'
]

numerical_column = ['bank_loan_float','SBA_loan_float','Term'
    
]


ordinal_column = [
    'LowDoc'
]

categorical_column = [
    'State','cat_activites','FranchiseCode','BankState','RevLineCr'
]


target_name = "MIS_Status"
data, target, numerical_data, ordinal_data, categorical_data = (
    df[features_of_interest],
    df[target_name],
    df[numerical_column],
    df[ordinal_column],
    df[categorical_column]
)

In [5]:
data.isna().sum()

State                 0
cat_activites         0
UrbanRural            0
LowDoc                0
bank_loan_float       0
SBA_loan_float        0
FranchiseCode         0
crisis                0
BankState          1566
Term                  0
RevLineCr          4528
MIS_Status            0
dtype: int64

In [6]:
X_train, X_test, y_train, y_test = train_test_split(data, target,train_size=0.9, random_state=42, stratify=data['MIS_Status'])
# Retirer la colonne 'MIS_Status' des ensembles X_train et X_test (car elle ne fait pas partie des features)
X_train = X_train.drop('MIS_Status', axis=1)
X_test = X_test.drop('MIS_Status', axis=1)


label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.fit_transform(y_test)

### Regression logistique

In [7]:
preprocessor = ColumnTransformer(
    [("categorical", OneHotEncoder(handle_unknown='ignore'), categorical_column),
    ("ordinal", OrdinalEncoder(), ordinal_column),
    ("numeric", StandardScaler(), numerical_column),
    ],
    remainder="passthrough",
)

log_model = make_pipeline(preprocessor,LogisticRegression())
log_model.fit(X_train,y_train_encoded)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [8]:
log_model.predict_log_proba

<bound method Pipeline.predict_log_proba of Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('categorical',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['State', 'cat_activites',
                                                   'FranchiseCode', 'BankState',
                                                   'RevLineCr']),
                                                 ('ordinal', OrdinalEncoder(),
                                                  ['LowDoc']),
                                                 ('numeric', StandardScaler(),
                                                  ['bank_loan_float',
                                                   'SBA_loan_float',
                                                   'Term'])])),
                ('logisticregression', LogisticRegressi

In [9]:
# Prédictions sur les scores de décision
decision_scores = log_model.decision_function(X_test)

# Calcul du score AUC-ROC
roc_auc = roc_auc_score(y_test, decision_scores)
# Si c'est un problème binaire, prenez la probabilité pour la classe 1 (classe positive)
y_pred = log_model.predict(X_test)
# Calcul du score AUC-ROC
roc_auc = roc_auc_score(y_test, decision_scores)
print("AUC-ROC score:", roc_auc)
print(log_model.score(X_test,y_test_encoded))
print(recall_score(y_test_encoded,y_pred))
print(f1_score(y_test_encoded,y_pred, pos_label=0))
print(classification_report(y_test_encoded,y_pred))

AUC-ROC score: 0.8534423016431549
0.863040359442597
0.9706886086194105
0.4780673871582963
              precision    recall  f1-score   support

           0       0.72      0.36      0.48     15782
           1       0.88      0.97      0.92     74135

    accuracy                           0.86     89917
   macro avg       0.80      0.66      0.70     89917
weighted avg       0.85      0.86      0.84     89917



In [10]:
df['Zip'].unique()

array([47711, 46526, 47401, ..., 70036, 66549, 26134], shape=(33611,))

### RANDOM FOREST GUMP

In [11]:
features_of_interest = [
    'State','cat_activites', 'UrbanRural','LowDoc','bank_loan_float','SBA_loan_float','FranchiseCode', 'crisis','Bank','MIS_Status'
]

numerical_column = ['bank_loan_float','SBA_loan_float'
]


ordinal_column = [
    'LowDoc'
]

categorical_column = [
    'State','cat_activites','FranchiseCode', 'Bank'
]


target_name = "MIS_Status"
data, target, numerical_data, ordinal_data, categorical_data = (
    df[features_of_interest],
    df[target_name],
    df[numerical_column],
    df[ordinal_column],
    df[categorical_column]
)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(data, target,train_size=0.9, random_state=42, stratify=data['MIS_Status'])
# Retirer la colonne 'MIS_Status' des ensembles X_train et X_test (car elle ne fait pas partie des features)
X_train = X_train.drop('MIS_Status', axis=1)
X_test = X_test.drop('MIS_Status', axis=1)


label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.fit_transform(y_test)

In [None]:
preprocessor = ColumnTransformer(
    [("categorical", OneHotEncoder(handle_unknown='ignore'), categorical_column),
    ("ordinal", OrdinalEncoder(), ordinal_column),
    ("numeric", RobustScaler(), numerical_column),
    ],
    remainder="passthrough",
)

rf_model = make_pipeline(preprocessor, RandomForestClassifier(n_estimators=10))
rf_model.fit(X_train,y_train_encoded)

In [None]:
# Si c'est un problème binaire, prenez la probabilité pour la classe 1 (classe positive)
proba = rf_model.predict_proba(X_test)[:, 0]
y_pred = rf_model.predict(X_test)
# Calcul du score AUC-ROC
roc_auc = roc_auc_score(y_test, proba)
print("AUC-ROC score:", roc_auc)
print(rf_model.score(X_test,y_test_encoded))
print(recall_score(y_test_encoded,y_pred))
print(f1_score(y_test_encoded,y_pred))
print(classification_report(y_test_encoded,y_pred))

NameError: name 'rf_model' is not defined

In [None]:
rf_model[0].get_feature_names_out()

array(['categorical__State_AK', 'categorical__State_AL',
       'categorical__State_AP', ..., 'numeric__SBA_loan_float',
       'remainder__UrbanRural', 'remainder__crisis'],
      shape=(8455,), dtype=object)

In [None]:
np.sort(rf_model[-1].feature_importances_)

array([0.        , 0.        , 0.        , ..., 0.03841056, 0.18376524,
       0.20575449], shape=(8455,))

In [None]:
df_rf = pd.DataFrame(rf_model[0].get_feature_names_out(), rf_model[-1].feature_importances_)

In [None]:
df_rf_ri = df_rf[0].reset_index()
df_rf_ri[0]

0          categorical__State_AK
1          categorical__State_AL
2          categorical__State_AP
3          categorical__State_AR
4          categorical__State_AZ
                  ...           
8450             ordinal__LowDoc
8451    numeric__bank_loan_float
8452     numeric__SBA_loan_float
8453       remainder__UrbanRural
8454           remainder__crisis
Name: 0, Length: 8455, dtype: object

In [None]:
def contient_state(table):
    str_cherche = 'State'
    contient_str = [x for x in df.columns if str_cherche in x]
    return contient_str

In [None]:
test_cs = contient_state(df_rf_ri[0])
test_cs

['State', 'BankState']