In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures, OrdinalEncoder, OneHotEncoder, StandardScaler, RobustScaler, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score,make_scorer, accuracy_score, f1_score, recall_score, classification_report

### LES FEATURES A GARDER SUITE A L'EDA

## Analyse du CHI2
State, NAICS(ou le cat_activites), UrbanRural, LowDoc, Bank,RevLineCr,FranchiseCode

## Analyse de l'ANOVA
DisbursementGross,bank_loan_float(float de GrAppv), SBA_loan_float(SBA_Appv)

In [2]:
df = pd.read_csv('dataset.csv')
pd.set_option('display.max_columns', None)
df['NewExist'] = df['NewExist'].fillna(0)


In [3]:
features_of_interest = [
    'State','cat_activites', 'UrbanRural','LowDoc','bank_loan_float','SBA_loan_float','FranchiseCode','crisis','BankState','Term','RevLineCr',
    'MIS_Status'
]

numerical_column = ['bank_loan_float','SBA_loan_float','Term'
    
]


ordinal_column = [
    'LowDoc'
]

categorical_column = [
    'State','cat_activites','FranchiseCode','BankState','RevLineCr'
]


target_name = "MIS_Status"
data, target, numerical_data, ordinal_data, categorical_data = (
    df[features_of_interest],
    df[target_name],
    df[numerical_column],
    df[ordinal_column],
    df[categorical_column]
)

In [4]:
data.isna().sum()

State                 0
cat_activites         0
UrbanRural            0
LowDoc                0
bank_loan_float       0
SBA_loan_float        0
FranchiseCode         0
crisis                0
BankState          1566
Term                  0
RevLineCr          4528
MIS_Status            0
dtype: int64

In [5]:
X_train, X_test, y_train, y_test = train_test_split(data, target,train_size=0.9, random_state=42, stratify=data['MIS_Status'])
# Retirer la colonne 'MIS_Status' des ensembles X_train et X_test (car elle ne fait pas partie des features)
X_train = X_train.drop('MIS_Status', axis=1)
X_test = X_test.drop('MIS_Status', axis=1)


label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.fit_transform(y_test)

### Regression logistique

In [6]:
preprocessor = ColumnTransformer(
    [("categorical", OneHotEncoder(handle_unknown='ignore'), categorical_column),
    ("ordinal", OrdinalEncoder(), ordinal_column),
    ("numeric", StandardScaler(), numerical_column),
    ],
    remainder="passthrough",
)

log_model = make_pipeline(preprocessor,LogisticRegression())
log_model.fit(X_train,y_train_encoded)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [8]:
X_train_2 = log_model.fit(X_train,y_train_encoded)
X_train_2

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [10]:
proba = log_model.predict_log_proba(X_train)
proba

array([[-2.85644942e+00, -5.91901388e-02],
       [-4.44395685e-01, -1.02502263e+00],
       [-1.95863686e+00, -1.52045220e-01],
       ...,
       [-3.85809322e+00, -2.13341735e-02],
       [-7.22548214e+00, -7.28066526e-04],
       [-3.53963806e+00, -2.94533529e-02]])

In [14]:
# Prédictions sur les scores de décision
decision_scores = log_model.decision_function(X_test)

# Calcul du score AUC-ROC
roc_auc = roc_auc_score(y_test, decision_scores)
# Si c'est un problème binaire, prenez la probabilité pour la classe 1 (classe positive)
y_pred = log_model.predict(X_test)
# Calcul du score AUC-ROC
roc_auc = roc_auc_score(y_test, decision_scores)
print("AUC-ROC score:", roc_auc)
print(log_model.score(X_test,y_test_encoded))
print(recall_score(y_test_encoded,y_pred))
print(f1_score(y_test_encoded,y_pred, pos_label=0))
print(classification_report(y_test_encoded,y_pred))

AUC-ROC score: 0.8533459844314168
0.8631626944849139
0.9700681189721454
0.4808000675162461
              precision    recall  f1-score   support

           0       0.72      0.36      0.48     15782
           1       0.88      0.97      0.92     74135

    accuracy                           0.86     89917
   macro avg       0.80      0.67      0.70     89917
weighted avg       0.85      0.86      0.84     89917

