# Apprentissage d’ensemble et forêts aléatoires
## Classification binaire du revenu (jeu de données Adult : <=50K / >50K)


In [15]:
# WARNINGS
import warnings
warnings.filterwarnings('ignore')

# NUMPY
import numpy as np

# SCIPY 
import scipy 
# STATS
import scipy.stats as stats
from scipy.stats import norm, skew

# MATPLOTLIB
import matplotlib as mlp
import matplotlib.pyplot as plt
%matplotlib inline 
# plt.style.use('fivethirtyeight') 

# PANDAS
import pandas as pd 
pd.set_option("display.max_rows", None, "display.max_columns", None) 

# SEABORN
import seaborn as sns

# PLOTLY 
import plotly.express as px
import plotly

# SCIKIT-LEARN: PRE-PROCESSING
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder # encodage des variables catégorielles ordinales
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder # encodage des variables catégorielles nominales
from sklearn.preprocessing import StandardScaler # standardisation des variables numériques
from sklearn.preprocessing import MinMaxScaler # normalisation des variables numériques
from sklearn.preprocessing import RobustScaler # normalisation des variables numériques
from sklearn.impute import SimpleImputer # imputation des valeurs manquantes
from sklearn.impute import KNNImputer # imputation des valeurs manquantes par la méthode KNN
from sklearn.feature_selection  import SelectKBest # sélectionner 
from sklearn. preprocessing import PolynomialFeatures 

# MODELES PREDICTIFS

## REGRESSION
from sklearn.linear_model import LogisticRegression # régréssion logistique

## SVM 
from sklearn.svm import LinearSVC # machines à vecteurs de support (linéaire)
from sklearn.svm import SVC # machines à vecteurs de support (non-linéaire)

## SGD
from sklearn.linear_model import SGDClassifier #  classifieurs (SVM, régression logistique, etc.) avec un algorithme SGD

## ARBRES, FORETS, APPRENTISSAGE D'ENSEMBLE
from sklearn.tree import DecisionTreeClassifier # arbres classification
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier 

# XGBoost
from xgboost import XGBClassifier, XGBRFClassifier
from xgboost import plot_tree, plot_importance

## KNN
from sklearn.neighbors import KNeighborsClassifier # KPP voisins

## YellowBrick
from yellowbrick.model_selection import LearningCurve
from yellowbrick.model_selection import ValidationCurve

# VALIDATION CROISEE + OPTIMISATION
from sklearn.model_selection import train_test_split # séparation des données en train et test set
from sklearn.model_selection import cross_val_score # validation croisée pour comparaison entre modèles
from sklearn.model_selection import validation_curve # courbe de validation: visulaisr les scores lors du choix d'un hyperparamétre
from sklearn.model_selection import GridSearchCV # tester plusieurs hyperparamètres
from sklearn.model_selection import RandomizedSearchCV # tester arbitrairement plusieurs hyperparamètres
from sklearn.model_selection import learning_curve # courbe d'apprentissage: visualisation les scores du train et du validation sets en fonction des quanitiés des données
 
## EVALUATION: METRIQUES DE CLASSIFICATION
from sklearn.metrics import accuracy_score # exactitude (accuracy)
from sklearn.metrics import f1_score # F1-score
from sklearn.metrics import confusion_matrix # matrice de confusion
# from sklearn.metrics import plot_confusion_matrix # graphique de la matrice de confusion
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report # rapport pour le modèle de classification
from sklearn.model_selection import LearningCurveDisplay
from sklearn.model_selection import cross_val_score
## EVALUATION: COURBE ROC
from sklearn.metrics import auc # aire sous la courbe 
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve 
from sklearn.metrics import precision_recall_curve #
from sklearn.metrics import f1_score, recall_score


# PIPELINE
from sklearn.pipeline import make_pipeline

# TRANSFORMATEUR COMPOSITE (PRE-PROCESSOR + MODELE)
from sklearn.compose import make_column_transformer


In [16]:
# Import libraries and modules

# Import the necessary libraries
import sys
import os

# Add the project directory to the sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Import everything from lib_import.py, data_preprocessing.py and model_evaluation.py
from lib.lib_import import *
from src.data_preprocessing import *
from src.model_evaluation import *

# Import the data
from data.data_extract import load_data

# Chargement des données

In [17]:
# Load the data

df_data = load_data()
df_data.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [18]:
df = df_data.copy()

In [19]:
df = fix_target(df)

In [20]:
df.duplicated().sum()

48

In [21]:
df.drop_duplicates(inplace=True)

In [22]:
df_avec_inutile_columns = df.copy()

In [23]:
df_sans_inutile_columns = df.copy()

In [24]:
df_sans_inutile_columns = remove_inutile_column(df_sans_inutile_columns)

In [25]:
df_sans_inutile_columns.duplicated().sum()

6233

In [26]:
df_avec_inutile_columns.duplicated().sum()

0

In [27]:
df_sans_inutile_columns = regroupe_categories(df_sans_inutile_columns)
df_sans_inutile_columns.duplicated().sum()

6530

In [28]:
df_avec_inutile_columns = regroupe_categories(df_avec_inutile_columns)
df_avec_inutile_columns.duplicated().sum()

0

In [25]:
df_sans_remove_inutile_column = df_data.copy()

In [26]:
df_sans_remove_inutile_column = fix_target(df_sans_remove_inutile_column)

In [27]:
# NOTE(review): drop_outliers is applied to `df` (the de-duplicated frame built
# earlier), not to the `df_sans_remove_inutile_column` prepared in the two
# previous cells, whose content is therefore discarded — confirm this is intended.
df_sans_remove_inutile_column = drop_outliers(df)

In [28]:
df_sans_remove_inutile_column.duplicated().sum()

6230

In [29]:
df_sans_remove_inutile_column.drop_duplicates(inplace=True)

In [30]:
df_sans_remove_inutile_column.duplicated().sum()

0

In [31]:
df_sans_remove_inutile_column = regroupe_categories(df_sans_remove_inutile_column)

In [32]:
df_sans_remove_inutile_column.duplicated().sum()

296

In [None]:
df = preprocess(df)
df.head(10)

In [None]:
df.shape

In [None]:
df_train, df_test = seperate_train_test(df, random_state=42)

In [None]:
cat_features = get_cat_features(df)
cont_features = get_cont_features(df)

In [None]:
df_train,df_test = impute_missing_cat_values(df_train, df_test,cat_features,strategy='most_frequent')


In [None]:
df_train,df_test = standardize(df_train, df_test, cont_features)

In [None]:
df_train,df_test = encode_cat_features_onehot(df_train, df_test, cat_features)

In [None]:
df_train.shape

# Modélisation prédictive

In [None]:
# Separate the features from the target. The '>50K' column must be removed
# from X: leaving it in lets every model read the label it is asked to
# predict (target leakage) and makes all downstream scores meaningless.
X_train = df_train.drop(columns='>50K').values
X_test = df_test.drop(columns='>50K').values

y_train = df_train['>50K'].values
y_test = df_test['>50K'].values

In [None]:
df_test.shape

In [None]:
df_train.shape

## Arbres de décision

In [None]:
tree_descision =  DecisionTreeClassifier(random_state=99)

In [None]:
## Decision tree: fit, confusion matrix and learning curve
tree_descision.fit(X_train, y_train)
y_pred = tree_descision.predict(X_test)


plot_confusion_matrix_sns(y_test, y_pred, "Arbre de decision")

# learning_curve returns the absolute training-set sizes and, for each size,
# the per-fold train and validation scores (hence the mean over axis=1 below).
N, train_score, val_score = learning_curve(tree_descision, X_train, y_train,
                                           cv=5, scoring='f1',
                                           train_sizes=np.linspace(0.1, 1, 10))

plt.figure(figsize=(12,8))
plt.plot(N, train_score.mean(axis=1), label='train score')
plt.plot(N, val_score.mean(axis=1), label='validation score')
plt.legend()
# The title and x-label previously described a logistic-regression validation
# curve (lambda = 1/C); this plot is a learning curve of the decision tree.
plt.title("Courbe d'apprentissage pour l'arbre de décision")
plt.ylabel('score')
plt.xlabel('Train size')
plt.show()

In [None]:
## TreeClassifierDecision
# class sklearn.tree.DecisionTreeClassifier(
# *, criterion='gini', splitter='best', max_depth=None, min_samples_split=2,
# min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, monotonic_cst=None)[source]#

In [None]:
tree_max_depth_10 = DecisionTreeClassifier(max_depth=10, random_state=99)
tree_max_depth_10_pipe = make_pipeline( tree_max_depth_10)
evaluation_learning_curve(tree_max_depth_10_pipe, "decisiontreeclassifier", df_train, y_train, score='f1_micro', cv=5)

In [None]:
# Eval val curve

def evaluation_validation_curve(model_pipe, model_name, df_train, y_train, param_name, param_range, score, cv):
    """Fit ``model_pipe`` and draw a Yellowbrick validation curve for one hyper-parameter.

    Parameters
    ----------
    model_pipe : pipeline exposing ``named_steps``, or a bare estimator.
    model_name : key of the estimator step in ``model_pipe.named_steps``
        (ignored when ``model_pipe`` is a bare estimator).
    df_train : training features passed to ``model_pipe.fit``.
    y_train : training target.
    param_name : name of the hyper-parameter to vary.
    param_range : values of the hyper-parameter to evaluate.
    score : scoring name forwarded to ``ValidationCurve``.
    cv : cross-validation setting forwarded to ``ValidationCurve``.

    Returns
    -------
    None. The curve is displayed through ``ValidationCurve.show()``.
    """
    model_pipe.fit(df_train, y_train)

    steps = getattr(model_pipe, "named_steps", None)
    if steps is None:
        # Bare estimator (no pipeline): it is the model and the data is used as is.
        model = model_pipe
        X_train = df_train
    else:
        model = steps[model_name]
        if "columntransformer" in steps:
            # The transformer was fitted by model_pipe.fit above on the same
            # data, so applying it is enough; there is no need to re-fit it.
            X_train = steps["columntransformer"].transform(df_train)
        else:
            X_train = df_train

    val_curve = ValidationCurve(model, param_name=param_name, param_range=param_range, cv=cv, scoring=score)
    val_curve.fit(X_train, y_train)
    val_curve.show()

In [None]:
tree_max_depth = DecisionTreeClassifier(max_depth=10, random_state=99)

In [None]:
# Validation curve of the decision tree over max_depth.
# NOTE(review): tree_descision is a bare DecisionTreeClassifier, not a pipeline
# with named steps — confirm evaluation_validation_curve supports that input.
evaluation_validation_curve(
    model_pipe=tree_descision,
    model_name="decisiontreeclassifier",
    df_train=df_train,
    y_train=y_train,
    param_name="max_depth",
    param_range=[3, 5, 8, 11, 12], 
    score='f1_micro',
    cv=5,
)

In [None]:
# Validation curve of the decision tree over min_samples_leaf.
# NOTE(review): tree_descision is a bare DecisionTreeClassifier, not a pipeline
# with named steps — confirm evaluation_validation_curve supports that input.
evaluation_validation_curve(
    model_pipe=tree_descision,
    model_name="decisiontreeclassifier",
    df_train=df_train,
    y_train=y_train,
    param_name="min_samples_leaf",
    param_range=[2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32], 
    score='f1_micro',
    cv=5,
)

#  Bagging : Forêts Aléatoires

In [None]:
random_forest = make_pipeline(preprocessor, RandomForestClassifier(random_state=99))
random_forest

In [None]:
print(random_forest)

In [None]:
feature_importances = evaluation_learning_curve(random_forest, "randomforestclassifier", df_train, y_train, score='f1_micro', cv=5)

In [None]:
evaluation_validation_curve(
    model_pipe=random_forest,
    model_name="randomforestclassifier",
    df_train=df_train,
    y_train=y_train,
    param_name="min_samples_leaf",
    param_range=[2, 4, 6, 8, 10, 12, 14, 16, 20, 24, 28, 32], 
    score='f1_micro',
    cv=5,
)

## Importance des features

In [None]:
feature_importances

In [None]:
# on va injecter ce tableau dans un dataframe
df_feature_importances = pd.DataFrame(feature_importances, index=df_train.columns)
df_feature_importances.T

In [None]:
plt.rcParams["figure.figsize"] = (10,2)
N = len(df_train.columns)
score_importances = (df_feature_importances.values.reshape(1,-1)[0])
ind = np.arange(N)    # the x locations for the groups
width = 0.35       # the width of the bars: can also be len(x) sequence

plt.bar(ind, score_importances, width)
plt.ylabel('Scores')
plt.title('Score d importance par feature')
plt.xticks(ind, list(df_train.columns.values))
plt.show()

## Erreur de classification

In [None]:
random_forest_clf = make_pipeline(preprocessor, RandomForestClassifier(random_state=99))
random_forest_clf.fit(df_train, y_train)
preds = random_forest_clf.predict(df_test)
score = random_forest_clf.score(df_test, y_test)

In [None]:
score

In [None]:
df_train.head()

In [None]:
# Class prediction error plot for the fitted random forest.
plt.rcParams["figure.figsize"] = (6,4)
from yellowbrick.classifier import ClassPredictionError
visualizer = ClassPredictionError(random_forest_clf.named_steps['randomforestclassifier'])

# The column transformer was already fitted on the training data by
# random_forest_clf.fit, so it is only *applied* here. Calling fit_transform
# on df_test would re-fit it on the test set: the test features would then be
# transformed with parameters learned from the test data instead of the
# training data.
X_train = random_forest_clf.named_steps["columntransformer"].transform(df_train)
X_test = random_forest_clf.named_steps["columntransformer"].transform(df_test)

visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show();

In [None]:
from yellowbrick.classifier import ROCAUC
rf_clf = random_forest_clf.named_steps['randomforestclassifier']
visualizer = ROCAUC(rf_clf) #, classes = classes)
# set_palette('bold')

visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()

## Optimisation : recherche aléatoire

In [None]:
from pprint import pprint
rf_clf = RandomForestClassifier(random_state=99)

print('Parameters en cours dutilisation:\n')
pprint(rf_clf.get_params())

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
%%time

# Number of trees in the random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features considered at each split.
# 'auto' is deprecated and later removed for RandomForestClassifier; for
# classifiers it was an alias of 'sqrt', so the old grid tested the same
# setting twice. 'log2' gives the search a genuinely different option.
max_features = ['sqrt', 'log2']
# Maximum depth of the trees (None = grow until the leaves are pure)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]

# Build the random search grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

In [None]:
%%time
# CPU times: user 12min 20s, sys: 1.14 s, total: 12min 21s
# Wall time: 1h 10min 40s
rf_clf_random = RandomizedSearchCV(
    estimator=rf_clf,
    param_distributions=random_grid,
    n_iter=30,
    cv=5,
    verbose=2,
    random_state=99,
    n_jobs = -1)
rf_clf_random.fit(X_train, y_train)

In [None]:
rf_clf_random.best_params_

In [None]:
best_rf_clf_random = rf_clf_random.best_estimator_

In [None]:
best_rf_clf_preds = best_rf_clf_random.predict(X_test)

In [None]:
from yellowbrick.classifier import ROCAUC
rf_clf = random_forest_clf.named_steps['randomforestclassifier']
visualizer = ROCAUC(best_rf_clf_random) #, classes = classes)
# set_palette('bold')

visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()

# Boosting : AdaBoost

In [None]:
adabost = make_pipeline(preprocessor, AdaBoostClassifier(DecisionTreeClassifier(random_state=99)))
adabost

In [None]:
print(adabost)

In [None]:
feature_importances_adaboost = evaluation_learning_curve(adabost, "adaboostclassifier", df_train, y_train, score='f1_micro', cv=5)

## Score d'importance des features

In [None]:
plt.rcParams["figure.figsize"] = (10,2)
df_feature_importances_adabost = pd.DataFrame(feature_importances_adaboost, index=df_train.columns)
df_feature_importances_adabost .T
N = len(df_train.columns)
score_importances_adaboost = (df_feature_importances_adabost .values.reshape(1,-1)[0])
ind = np.arange(N)    # the x locations for the groups
width = 0.35       # the width of the bars: can also be len(x) sequence

plt.bar(ind, score_importances_adaboost, width)
plt.ylabel('Scores')
plt.title('Score d importance par feature')
plt.xticks(ind, list(df_train.columns.values))
plt.show()

## Optimisation

Les hyper-paramètres d'adaboost : 

- **estimator** (anciennement **base_estimator**, renommé à partir de scikit-learn 1.2) : Ce paramètre indique le type d'apprenant faible (« weak learner ») utilisé. Il peut s'agir d'un arbre de décision, d'une régression logistique, d'un SVC, etc. Par défaut, l'estimateur de base est DecisionTreeClassifier(max_depth=1).

- **n_estimators** : Le nombre d'estimateurs de base (apprenants faibles) que nous voulons utiliser dans notre modèle d'ensemble. Par défaut, la valeur de n_estimators est de 50.

- **learning_rate** (taux d'apprentissage) : Ce paramètre est fourni pour réduire la contribution de chaque classificateur. Par défaut, il a une valeur de 1.

- **algorithm** : Il peut s'agir de SAMME ou de SAMME.R (SAMME.R est déprécié à partir de scikit-learn 1.4).

In [None]:
def evaluation_model(model, X, y, score='f1_micro', cv=5):
    """Cross-validate ``model`` on ``(X, y)`` and return the cross_val_score result.

    ``score`` is forwarded as the ``scoring`` argument and ``cv`` as the
    cross-validation setting of ``cross_val_score``.
    """
    return cross_val_score(model, X, y, scoring=score, cv=cv)


def _get_models_by_n_estimators():
    """Return AdaBoost classifiers keyed by their number of estimators (as a string).

    One classifier is built per value in (10, 20, 30, 40, 50, 500), each with
    random_state=99.
    """
    return {
        str(count): AdaBoostClassifier(n_estimators=count, random_state=99)
        for count in (10, 20, 30, 40, 50, 500)
    }

def _get_models_by_learning_rate():
    """Return AdaBoost classifiers keyed by their learning rate formatted as '%.3f'.

    Twenty learning rates from 0.1 to 2.0 (inclusive) are covered. Each
    classifier uses random_state=99, like the models built by
    ``_get_models_by_n_estimators``, so that the cross-validated scores are
    reproducible from one run to the next.
    """
    # np.linspace fixes the number of points explicitly; np.arange with a
    # float step may gain or lose the end point through rounding.
    learning_rates = np.linspace(0.1, 2.0, 20)
    return {
        '%.3f' % rate: AdaBoostClassifier(learning_rate=float(rate), random_state=99)
        for rate in learning_rates
    }


In [None]:
results, names = list(), list()
models = _get_models_by_n_estimators()
for name, model in models.items():
    scores = evaluation_model(model, X=X_train, y=y_train)
    results.append(scores)
    names.append(name)

In [None]:
models.keys()

In [None]:
df_scores_adaboost_n_estimators = pd.DataFrame(results, index=models.keys())
df_scores_adaboost_n_estimators.T

In [None]:
df_scores_adaboost_n_estimators.columns

In [None]:
fig = px.box(df_scores_adaboost_n_estimators, x=df_scores_adaboost_n_estimators.columns) #, y="total_bill", color="smoker")
fig.update_traces(quartilemethod="exclusive") # or "inclusive", or "linear" by default
fig.show()

In [None]:
plt.boxplot(results, labels=names, showmeans=True)
plt.show()

In [None]:
%%time
# Cross-validate one AdaBoost model per learning rate.
results_eta_learning, names_eta_learning = [], []
models_eta_learning = _get_models_by_learning_rate()
for name, model in models_eta_learning.items():
    scores_eta_learning = evaluation_model(model, X=X_train, y=y_train)
    results_eta_learning.append(scores_eta_learning)
    # Store the label in this experiment's own list: appending to `names`
    # left names_eta_learning empty and polluted the labels collected for
    # the n_estimators experiment.
    names_eta_learning.append(name)

In [None]:
df_results_eta_learning = pd.DataFrame(results_eta_learning, index=models_eta_learning.keys())
df_results_eta_learning.T

In [None]:
list(models_eta_learning.keys())

In [None]:
fig = px.box(df_results_eta_learning, x=df_results_eta_learning.columns) #, y="total_bill", color="smoker")
fig.update_traces(quartilemethod="exclusive") # or "inclusive", or "linear" by default
fig.show()

# Gradient Boosting

In [None]:
# Initialisation du modèle XGBoost
xgb_pipe = make_pipeline(preprocessor, XGBClassifier(random_state=99))
xgb_pipe

In [None]:
print(xgb_pipe)

In [None]:
# xgb_clf = xgb.XGBClassifier(random_state=99)
# xgb_clf.fit(X_train, y_train_xgb)
# ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got [1 2 3]
# On doit changer l'encodage de y de [1, 2, 3] à [0, 1, 2]

In [None]:
# Map the class names to the 0-based integer codes expected by XGBoost.
class_map = {'GALAXY':0,'STAR':1,'QSO':2}

# NOTE(review): `data_stellar` is not defined anywhere in the visible notebook
# (the data loaded here is `df_data`) — presumably a leftover from another
# notebook; confirm before running.
# NOTE(review): both assignments map the full data_stellar['class'] column,
# so the "train" and "test" labels built here are identical.
y_train_xgb = data_stellar['class'].map(class_map)
y_train_xgb = y_train_xgb.values

y_test_xgb = data_stellar['class'].map(class_map)
y_test_xgb = y_test_xgb.values

In [None]:
# xgb_clf = xgb.XGBClassifier(random_state=99)
# xgb_clf.fit(X_train, y_train_xgb)

In [None]:
# Stratified 80/20 split of data_stellar on its "class" column; the class
# column of each part becomes the XGBoost labels.
# NOTE(review): `data_stellar` is not defined in the visible notebook, and the
# X_train / X_test arrays used with these labels are built from df_train /
# df_test — confirm that features and labels describe the same rows.
data_train, data_test = train_test_split(data_stellar, test_size = 0.2, stratify=data_stellar["class"], random_state=99)
y_train_xgb = data_train['class']
y_test_xgb = data_test['class']

In [None]:
y_train_xgb

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the class labels as consecutive integers starting at 0.
le = LabelEncoder()
y_train_xgb = le.fit_transform(y_train_xgb)
# XGBClassifier is imported directly at the top of the notebook; no `xgb`
# module alias is imported, so `xgb.XGBClassifier` would raise a NameError.
xgb_clf = XGBClassifier(random_state=99)
xgb_clf.fit(X_train, y_train_xgb)

In [None]:
y_test_xgb

In [None]:
# Reuse the encoder fitted on the training labels: re-fitting it on the test
# labels could assign different integer codes to the same classes.
y_test_xgb = le.transform(y_test_xgb)

In [None]:
y_pred_xgb = xgb_clf.predict(X_test)
y_pred_xgb 

In [None]:
accuracy_score(y_test_xgb, y_pred_xgb)

In [None]:
cm = confusion_matrix(y_test_xgb, y_pred_xgb, labels=xgb_clf.classes_)
fig, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(cm,annot=True,fmt="",linewidth=.5, cmap="mako",xticklabels=xgb_clf.classes_, yticklabels=xgb_clf.classes_)
ax.set(xlabel="Predicted", ylabel="True")
ax.xaxis.tick_top()
plt.yticks(rotation=0)
plt.show()

In [None]:
## Importance des features
# Ref : https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.plotting

#importance_type (str, default "weight") – How the importance is calculated: either “weight”, “gain”, or “cover”

# ”weight” is the number of times a feature appears in a tree
# ”gain” is the average gain of splits which use the feature
# ”cover” is the average coverage of splits which use the feature where coverage is defined as the number of samples affected by the split

In [None]:
plt.rcParams['figure.figsize'] = (10.0, 8)
# plot_importance is imported directly from xgboost at the top of the
# notebook; no `xgb` module alias is imported.
plot_importance(xgb_clf)

In [None]:
# Same plot using the "gain" importance; plot_importance is imported directly
# from xgboost (no `xgb` module alias is imported in this notebook).
plot_importance(xgb_clf, importance_type="gain")

In [None]:
## Traçage d'un arbre
# Nécessite l'installation du package 'graphviz'
# pip install graphviz

In [None]:
plt.rcParams['figure.figsize'] = (20.0, 8)

# Plot the first tree of the XGBoost model. plot_tree is imported directly
# from xgboost at the top of the notebook; no `xgb` module alias is imported.
plot_tree(xgb_clf, num_trees=0)

## Optimisation

In [None]:
# Hyper-parameter distributions sampled by the randomized search.
rs_param_grid = {
    'max_depth': list((range(3,12))),
    'alpha': [0,0.001, 0.01,0.1,1],
    'subsample': [0.5,0.75,1],
    'learning_rate': np.linspace(0.01,0.5, 10),
    'n_estimators': [10, 25, 40]
    }


# XGBClassifier is imported directly at the top of the notebook; no `xgb`
# module alias is imported, so `xgb.XGBClassifier` would raise a NameError.
xgb_clf = XGBClassifier(random_state=99)

xgb_rs = RandomizedSearchCV(estimator=xgb_clf,param_distributions=rs_param_grid, 
                                cv=5, verbose=2, random_state=99, scoring='f1_micro')

xgb_rs.fit(X_train, y_train_xgb)

print("Meilleurs paramètres trouvés: ", xgb_rs.best_params_)
print("Meilleure performance: ", xgb_rs.best_score_)

In [None]:
from yellowbrick.classifier import ROCAUC
# rf_clf = random_forest_clf.named_steps['randomforestclassifier']
visualizer = ROCAUC(xgb_rs.best_estimator_) #, classes = classes)
# set_palette('bold')

visualizer.fit(X_train,  y_train_xgb)
visualizer.score(X_test, y_test_xgb)
visualizer.show()

# Stacking

In [None]:
# Definition of the base estimators of the stack
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression


dtc =  DecisionTreeClassifier()
rfc = RandomForestClassifier()
knn =  KNeighborsClassifier()
# Named xgb_model rather than `xgb`, which is the conventional alias of the
# xgboost module and is used as such in earlier cells.
xgb_model = XGBClassifier()

# Estimator names carry no trailing spaces, so that nested parameters can be
# addressed as e.g. `rfc__n_estimators`.
estimator_list = [
    ('dtc', dtc),
    ('rfc', rfc),
    ('knn', knn),
    ('xgb', xgb_model),
 ]

# Build the stacked model; a logistic regression combines the base predictions
stack_model = StackingClassifier(
    estimators=estimator_list, final_estimator=LogisticRegression()
)


In [None]:
# Training
stack_model.fit(X_train, y_train_xgb)

In [None]:

# Calcul des prédictions
y_test_pred = stack_model.predict(X_test)

In [None]:
# Score the stacking model's own predictions (y_test_pred); y_pred_xgb holds
# the predictions of the standalone XGBoost model.
accuracy_score(y_test_xgb, y_test_pred)