In [79]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import TargetEncoder, OneHotEncoder, MinMaxScaler
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib
%matplotlib inline

In [80]:
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

In [81]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest
import xgboost 
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier
from sklearn import set_config

#class_weight={0:1, 1:12}
# Define the encoding strategy for features
categorical_features = X_train.select_dtypes(include='object').columns
numerical_features = X_train.select_dtypes(exclude="object").columns
n_unique_categories = X_train[categorical_features].nunique().sort_values(ascending=False)
high_cardinality_features = n_unique_categories[n_unique_categories > 15].index
low_cardinality_features = n_unique_categories[n_unique_categories <= 15].index

set_config(transform_output="pandas")

# Define the preprocessing steps for numeric features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())])

# Define the preprocessing steps for categorical features
high_categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('targ', TargetEncoder()),
    ('scaler', MinMaxScaler())])

# Define the preprocessing steps for categorical features
low_categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown="ignore"))])

# Use ColumnTransformer to apply the transformations to the correct columns in the dataframe
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('high_cat', high_categorical_transformer, high_cardinality_features),
        ('low_cat', low_categorical_transformer, low_cardinality_features)])


In [82]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

steps7 = [('preprocessor', preprocessor), ('over', SMOTE()), 
          ('model',  GradientBoostingClassifier(learning_rate= 0.1, n_estimators= 50, random_state=1))]
GradientBoosting_tuned = Pipeline(steps=steps7)

In [83]:
test = X_test.iloc[0]
test = test.to_dict()
#df = pd.DataFrame([test])
#df

In [92]:
import json
t = json.dumps(test)
print(t) 

{"NAME_CONTRACT_TYPE": "Cash loans", "CODE_GENDER": "F", "FLAG_OWN_CAR": "N", "FLAG_OWN_REALTY": "N", "CNT_CHILDREN": 2, "AMT_INCOME_TOTAL": 99000.0, "AMT_CREDIT": 299772.0, "AMT_ANNUITY": 19647.0, "AMT_GOODS_PRICE": 247500.0, "NAME_TYPE_SUITE": "Spouse, partner", "NAME_INCOME_TYPE": "Commercial associate", "NAME_EDUCATION_TYPE": "Secondary / secondary special", "NAME_FAMILY_STATUS": "Married", "NAME_HOUSING_TYPE": "House / apartment", "REGION_POPULATION_RELATIVE": 0.0228, "DAYS_BIRTH": -12715, "DAYS_EMPLOYED": -2695.0, "DAYS_REGISTRATION": -6624.0, "DAYS_ID_PUBLISH": -2431, "OWN_CAR_AGE": NaN, "FLAG_MOBIL": 1, "FLAG_EMP_PHONE": 1, "FLAG_WORK_PHONE": 1, "FLAG_CONT_MOBILE": 1, "FLAG_PHONE": 1, "FLAG_EMAIL": 0, "OCCUPATION_TYPE": "Laborers", "CNT_FAM_MEMBERS": 4.0, "REGION_RATING_CLIENT": 2, "REGION_RATING_CLIENT_W_CITY": 2, "WEEKDAY_APPR_PROCESS_START": "SUNDAY", "HOUR_APPR_PROCESS_START": 13, "REG_REGION_NOT_LIVE_REGION": 0, "REG_REGION_NOT_WORK_REGION": 0, "LIVE_REGION_NOT_WORK_REGION

In [88]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

def predict_bank(X, model):
    
    if type(X) == dict:
        df = pd.DataFrame([X])
    else:
        df = pd.DataFrame(X).T
    
    model.fit(X_train, y_train)
    y_proba = model.predict_proba(df)[:, 1]
    y_pred = (y_proba > 0.25).astype("int")
    
    if y_pred == 0:
        print('Client solvable :', "la probabilité de faillite est de",y_proba*100, "%")
    elif y_pred == 1:
        print('Client à risque :', "la probabilité de faillite est de",y_proba*100, "%")
    
    return y_pred

In [89]:
predict_bank(test, GradientBoosting_tuned)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Client à risque : la probabilité de faillite est de [58.53210816] %


array([1])

###  Create Model File

In [86]:
import pickle

# Save to file in the current working directory
pkl_filename = "model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(GradientBoosting_tuned, file)

# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)

In [87]:
##loading the model from the saved file
pkl_filename = "model.pkl"
with open(pkl_filename, 'rb') as f_in:
    model = pickle.load(f_in)

predictValue = predict_bank(test, model)
predictValue

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Client à risque : la probabilité de faillite est de [56.73408152] %
