In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.model_selection import learning_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
import pickle

# Fonctions

In [None]:
def save_model(model, model_name):
    '''
    Serialize an object (typically a fitted model) to disk with pickle.

    Parameters
    ----------
    model : object
        Object to serialize; it is handed unchanged to ``pickle.dump``.
    model_name : str
        Base name of the output file. The object is written to
        ``./Streamlit/modeles/<model_name>.sav`` (relative to the current
        working directory).

    Raises
    ------
    OSError
        If the target file cannot be opened for writing, e.g. when the
        ``./Streamlit/modeles`` directory does not exist.
    '''
    filename = f'./Streamlit/modeles/{model_name}.sav'
    # Use a context manager so the file handle is flushed and closed even if
    # pickling fails; a bare open() would stay open until garbage collection.
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

    print(f"Modèle {model_name} sauvegardé avec succès")

In [None]:
def _feature_importance(model, feature_names):
    '''
    Return the feature importances of ``model`` as a descending Series.

    Two attributes are probed in order: ``coef_`` (absolute value of its first
    row) and ``feature_importances_``. When both are usable the second one
    wins; when neither is usable an empty list is returned.
    '''
    importance = []
    extractors = (
        ('coef_', lambda coef: np.abs(coef[0])),
        ('feature_importances_', lambda values: values),
    )
    for attribute, extract in extractors:
        try:
            values = extract(getattr(model, attribute))
            importance = pd.Series(values, index=feature_names).sort_values(ascending=False)
        except (AttributeError, IndexError, TypeError, ValueError):
            # The attribute is missing for this estimator, or its shape does
            # not match the feature names: keep whatever was found so far.
            continue
    return importance


def save_info_model(liste_model, liste_names, X_test, y_test):
    '''
    Evaluate each fitted model on the test set and pickle the results.

    For every model of ``liste_model`` the following list is built:
    ``[name, predictions, y_test, confusion matrix, classification report
    (as a dict), feature importance]``. The feature importance is a pandas
    Series sorted in descending order when the model exposes ``coef_`` or
    ``feature_importances_``, and an empty list otherwise. All these lists
    are gathered in one list that is pickled to
    ``./Streamlit/modeles/modeles_info``.

    Parameters
    ----------
    liste_model : sequence
        Fitted estimators exposing ``predict``.
    liste_names : sequence of str
        Label stored with each model, in the same order as ``liste_model``.
    X_test : DataFrame or array-like
        Test features. When it has a ``columns`` attribute, those column
        names index the feature importances.
    y_test : array-like
        True labels matching ``X_test``.

    Raises
    ------
    IndexError
        If ``liste_names`` is shorter than ``liste_model``.
    OSError
        If the output file cannot be opened for writing.
    '''
    # Take the feature names from the data that was passed in rather than
    # from a notebook-level variable, so the function only depends on its
    # arguments.
    feature_names = list(X_test.columns) if hasattr(X_test, 'columns') else None

    liste_save = []
    for index_name, model in enumerate(liste_model):
        predictions = model.predict(X_test)
        # Score against the `y_test` argument (not a global of a similar name).
        matrice_confusion = confusion_matrix(y_test, predictions)
        rapport = classification_report(y_test, predictions, output_dict=True)
        model_importance = _feature_importance(model, feature_names)

        # Use a dedicated name for the record so the `liste_model` argument is
        # not rebound while it is being iterated over.
        infos_model = [
            liste_names[index_name],
            predictions,
            y_test,
            matrice_confusion,
            rapport,
            model_importance,
        ]
        liste_save.append(infos_model)

    with open("./Streamlit/modeles/modeles_info", "wb") as fp:
        pickle.dump(liste_save, fp)

In [3]:
# Load the raw dataset; the file uses ';' as its field separator.
data = pd.read_csv(filepath_or_buffer=r'./depart_employes.csv', sep=";")

In [4]:
# Move the "depart" column out of the frame and re-insert it, cast to a
# categorical dtype, as the last column under the name "target".
data["target"] = data.pop("depart").astype('category')

In [5]:
# Column groups are selected purely by position:
#   col      -> every column except the last one ("target", appended above)
#   num_cols -> every column except the last three
#   cat_cols -> the two columns located just before "target"
# NOTE(review): presumably cat_cols is ['Service', 'niveau_salaire'], the two
# names encoded further down - confirm against the CSV column order.
col = list(data.columns)[:-1]
num_cols = col[:-2]
cat_cols = col[-2:]

# PRE-PROCESSING

#### Utilisation d'un pipeline pour faire le prétraitement

In [6]:
# Named container for the preprocessing transformers. In the cells visible
# here the steps are only accessed one at a time by name; the pipeline object
# itself is never fitted or applied as a whole.
pipeline = Pipeline(steps=[
    ("MinMaxScaler", MinMaxScaler()),
    ("StandardScaler", StandardScaler()),
    ("encoder_Service", LabelEncoder()),
    ("encoder_niveau_salaire", LabelEncoder()),
])

In [None]:
# Rescale the numeric columns with the pipeline's MinMaxScaler and wrap the
# resulting array back into a DataFrame carrying the same column names.
# NOTE(review): the scaler is fitted on the full dataset here, i.e. before the
# train/test split performed in a later cell.
df = pd.DataFrame(
    pipeline.named_steps["MinMaxScaler"].fit_transform(data[num_cols]),
    columns=data[num_cols].columns.tolist(),
)

# Re-attach the untouched categorical columns and the target.
df[cat_cols] = data[cat_cols]
df["target"] = data["target"]

In [None]:
# Separate the label from the features.
Y = df["target"]
X = df.drop("target", axis=1)

# 80/20 split, stratified on the target so both parts keep the same class
# proportions. The seed is fixed (same value as the seeded estimators below)
# so that re-running the notebook yields the same split and therefore the
# same saved models and metrics.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, stratify=Y, train_size=0.8, random_state=0
)

In [None]:
# Fit the remaining transformers of the pipeline on the training split; each
# variable receives the transformed values returned by fit_transform.
# NOTE(review): these results are only kept in standalone variables; X_train
# and X_test themselves are not modified by this cell - confirm that fitting
# the models on the untransformed X_train is intended.
standardScaler = pipeline.named_steps['StandardScaler'].fit_transform(X_train[num_cols])
encoder_Service = pipeline.named_steps['encoder_Service'].fit_transform(X_train['Service'])
encoder_niveau_salaire = pipeline.named_steps['encoder_niveau_salaire'].fit_transform(X_train['niveau_salaire'])

#### Vérifier que les transformers ont bien été entraînés
Si l'une des cellules suivantes lève une erreur, c'est que le transformer correspondant n'a pas été entraîné.

In [None]:
# Displays the labels learned by the encoder; the attribute only exists once
# the encoder has been fitted.
pipeline.named_steps['encoder_Service'].classes_

In [None]:
# Displays the per-feature scaling factors; the attribute only exists once
# the scaler has been fitted.
pipeline.named_steps['MinMaxScaler'].scale_

#### Enregistrement du pipeline

In [None]:
import joblib
# joblib.dump(pipeline, f'./Streamlit/modeles/pipeline.pkl')

### Modèles

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, tree

# Every classifier is fitted on the same training split. Estimators that
# involve randomness receive an explicit seed so that re-running the notebook
# reproduces the same fitted models.

# k-nearest neighbours with k = 3.
model_KNN = KNeighborsClassifier(n_neighbors=3)
model_KNN.fit(X_train, Y_train)

# Logistic regression.
model_RL = LogisticRegression(random_state=0)
model_RL.fit(X_train, Y_train)

# Support vector classifier with the library defaults.
model_SVG = svm.SVC()
model_SVG.fit(X_train, Y_train)

# Decision tree; seeded like the other estimators (it was the only stochastic
# model left unseeded, which made its splits vary between runs).
model_TREE = tree.DecisionTreeClassifier(random_state=0)
model_TREE.fit(X_train, Y_train)

# Random forest restricted to shallow trees (max_depth=2).
model_RF = RandomForestClassifier(max_depth=2, random_state=0)
model_RF.fit(X_train, Y_train)

### Sauvegarde des infos des modèles

In [None]:
# Fitted estimators and the label under which each evaluation is stored;
# both lists must stay in the same order.
model = [
    model_KNN,
    model_RL,
    model_SVG,
    model_TREE,
    model_RF,
]
model_names = [
    "KNN_info",
    "RL_info",
    "SVG_info",
    "TREE_info",
    "RF_info",
]

save_info_model(
    liste_model=model,
    liste_names=model_names,
    X_test=X_test,
    y_test=Y_test,
)

### Sauvegarde des modèles

In [None]:
# `model` is the list holding all five fitted estimators, so they are pickled
# together into a single file.
save_model(model=model, model_name='modele')