# Notebook pour l'entrainement d'un modèle

## Importation des bibliothèques

In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
import utils

---

## Importation des données

In [2]:
# Load the three raw CSV extracts (SIRH, survey "sondage", evaluations) in one statement.
df_sirh, df_sondage, df_eval = (
    pd.read_csv("../data/extrait_sirh.csv"),
    pd.read_csv("../data/extrait_sondage.csv"),
    pd.read_csv("../data/extrait_eval.csv"),
)

In [3]:
# Preview the first rows of the SIRH extract (column names and value formats).
df_sirh.head()

Unnamed: 0,id_employee,age,genre,revenu_mensuel,statut_marital,departement,poste,nombre_experiences_precedentes,nombre_heures_travailless,annee_experience_totale,annees_dans_l_entreprise,annees_dans_le_poste_actuel
0,1,41,F,5993,Célibataire,Commercial,Cadre Commercial,8,80,8,6,4
1,2,49,M,5130,Marié(e),Consulting,Assistant de Direction,1,80,10,10,7
2,4,37,M,2090,Célibataire,Consulting,Consultant,6,80,7,0,0
3,5,33,F,2909,Marié(e),Consulting,Assistant de Direction,1,80,8,8,7
4,7,27,M,3468,Marié(e),Consulting,Consultant,9,80,6,2,2


In [4]:
# Preview the first rows of the survey extract; it carries the a_quitte_l_entreprise column.
df_sondage.head()

Unnamed: 0,a_quitte_l_entreprise,nombre_participation_pee,nb_formations_suivies,nombre_employee_sous_responsabilite,code_sondage,distance_domicile_travail,niveau_education,domaine_etude,ayant_enfants,frequence_deplacement,annees_depuis_la_derniere_promotion,annes_sous_responsable_actuel
0,Oui,0,0,1,1,1,2,Infra & Cloud,Y,Occasionnel,0,5
1,Non,1,3,1,2,8,1,Infra & Cloud,Y,Frequent,1,7
2,Oui,0,3,1,4,2,2,Autre,Y,Occasionnel,0,0
3,Non,0,3,1,5,3,4,Infra & Cloud,Y,Frequent,3,0
4,Non,1,3,1,7,2,1,Transformation Digitale,Y,Occasionnel,2,2


In [5]:
# Preview the first rows of the evaluation extract.
df_eval.head()

Unnamed: 0,satisfaction_employee_environnement,note_evaluation_precedente,niveau_hierarchique_poste,satisfaction_employee_nature_travail,satisfaction_employee_equipe,satisfaction_employee_equilibre_pro_perso,eval_number,note_evaluation_actuelle,heure_supplementaires,augementation_salaire_precedente
0,2,3,2,4,1,1,E_1,3,Oui,11 %
1,3,2,2,2,4,3,E_2,4,Non,23 %
2,4,2,1,3,2,3,E_4,3,Oui,15 %
3,4,3,1,3,3,3,E_5,3,Oui,11 %
4,1,3,1,2,4,3,E_7,3,Non,12 %


---

## Préparation des données

---
### Préparation pour le merge des DF

In [6]:
# Convert the evaluation key (displayed above as strings such as "E_1") with the
# project helper so it can be matched against the integer id_employee in the merge below.
# NOTE(review): the column is overwritten in place, so re-running this cell feeds the
# already-converted values back into utils.text_to_numeric — confirm the helper tolerates that.
df_eval['eval_number'] = utils.text_to_numeric(df_eval['eval_number'])
# Show the dtypes to check the result of the conversion.
df_eval.dtypes

satisfaction_employee_environnement           int64
note_evaluation_precedente                    int64
niveau_hierarchique_poste                     int64
satisfaction_employee_nature_travail          int64
satisfaction_employee_equipe                  int64
satisfaction_employee_equilibre_pro_perso     int64
eval_number                                   int64
note_evaluation_actuelle                      int64
heure_supplementaires                        object
augementation_salaire_precedente             object
dtype: object

In [7]:
# Column dtypes of the SIRH extract; id_employee is the key used for the merges below.
df_sirh.dtypes

id_employee                        int64
age                                int64
genre                             object
revenu_mensuel                     int64
statut_marital                    object
departement                       object
poste                             object
nombre_experiences_precedentes     int64
nombre_heures_travailless          int64
annee_experience_totale            int64
annees_dans_l_entreprise           int64
annees_dans_le_poste_actuel        int64
dtype: object

In [8]:
# Column dtypes of the survey extract; code_sondage is the key used for the first merge below.
df_sondage.dtypes

a_quitte_l_entreprise                  object
nombre_participation_pee                int64
nb_formations_suivies                   int64
nombre_employee_sous_responsabilite     int64
code_sondage                            int64
distance_domicile_travail               int64
niveau_education                        int64
domaine_etude                          object
ayant_enfants                          object
frequence_deplacement                  object
annees_depuis_la_derniere_promotion     int64
annes_sous_responsable_actuel           int64
dtype: object

---
### Merge des dataframes

In [9]:
# Outer-join the SIRH and survey extracts on their employee keys. indicator=True adds a
# temporary `_merge` column recording which side(s) each row came from.
df_first_merge = pd.merge(
    df_sirh,
    df_sondage,
    how='outer',
    left_on='id_employee',
    right_on='code_sondage',
    indicator=True,
)
# Count only the rows present in BOTH frames: sum the boolean mask. Taking len() of the
# mask returns the total row count, which made this check always report a perfect match.
print(f"Il y a {(df_first_merge['_merge'] == 'both').sum()} correspondances sur {len(df_first_merge)}")
# The indicator column is only needed for the check above.
df_first_merge = df_first_merge.drop(columns=['_merge'])

Il y a 1470 correspondances sur 1470


In [10]:
# Outer-join the previous merge result with the evaluation extract on the employee key.
# indicator=True adds a temporary `_merge` column recording which side(s) each row came from.
df_data = pd.merge(
    df_first_merge,
    df_eval,
    how='outer',
    left_on='id_employee',
    right_on='eval_number',
    indicator=True,
)
# Count only the rows present in BOTH frames: sum the boolean mask. Taking len() of the
# mask returns the total row count, which made this check always report a perfect match.
print(f"Il y a {(df_data['_merge'] == 'both').sum()} correspondances sur {len(df_data)}")
# The indicator column is only needed for the check above.
df_data = df_data.drop(columns=['_merge'])

Il y a 1470 correspondances sur 1470


In [12]:
# Preview the fully merged frame (SIRH + survey + evaluation columns).
df_data.head()

Unnamed: 0,id_employee,age,genre,revenu_mensuel,statut_marital,departement,poste,nombre_experiences_precedentes,nombre_heures_travailless,annee_experience_totale,...,satisfaction_employee_environnement,note_evaluation_precedente,niveau_hierarchique_poste,satisfaction_employee_nature_travail,satisfaction_employee_equipe,satisfaction_employee_equilibre_pro_perso,eval_number,note_evaluation_actuelle,heure_supplementaires,augementation_salaire_precedente
0,1,41,F,5993,Célibataire,Commercial,Cadre Commercial,8,80,8,...,2,3,2,4,1,1,1,3,Oui,11 %
1,2,49,M,5130,Marié(e),Consulting,Assistant de Direction,1,80,10,...,3,2,2,2,4,3,2,4,Non,23 %
2,4,37,M,2090,Célibataire,Consulting,Consultant,6,80,7,...,4,2,1,3,2,3,4,3,Oui,15 %
3,5,33,F,2909,Marié(e),Consulting,Assistant de Direction,1,80,8,...,4,3,1,3,3,3,5,3,Oui,11 %
4,7,27,M,3468,Marié(e),Consulting,Consultant,9,80,6,...,1,3,1,2,4,3,7,3,Non,12 %


---
### Encodage des données

#### Définition des groupes de colonnes et du pipeline de prétraitement

In [None]:
# Columns the author marks as not useful for modelling (the three join keys plus three
# others). They are not referenced by any transformer in this cell.
useless_columns = [
    "id_employee",
    "eval_number",
    "code_sondage",
    "nombre_heures_travailless",
    "nombre_employee_sous_responsabilite",
    "ayant_enfants",
]

# Name of the label column.
target_column = "a_quitte_l_entreprise"

# Nominal (unordered) text columns.
category_columns = [
    "genre",
    "statut_marital",
    "departement",
    "poste",
    "domaine_etude",
    "heure_supplementaires",
]

# Ordered text columns and, at the same position, the ordered list of their categories.
ordinales_columns = ["frequence_deplacement"]
ordinales_categories = [["Aucun", "Occasionnel", "Frequent"]]

# Numeric columns, grouped by the extract they come from (order is preserved).
numeric_columns = (
    # SIRH extract
    [
        "revenu_mensuel",
        "age",
        "nombre_experiences_precedentes",
        "annee_experience_totale",
        "annees_dans_l_entreprise",
        "annees_dans_le_poste_actuel",
    ]
    # Survey extract
    + [
        "nombre_participation_pee",
        "nb_formations_suivies",
        "distance_domicile_travail",
        "niveau_education",
        "annees_depuis_la_derniere_promotion",
        "annes_sous_responsable_actuel",
    ]
    # Evaluation extract
    + [
        "satisfaction_employee_environnement",
        "note_evaluation_precedente",
        "niveau_hierarchique_poste",
        "satisfaction_employee_nature_travail",
        "satisfaction_employee_equipe",
        "satisfaction_employee_equilibre_pro_perso",
        "note_evaluation_actuelle",
    ]
)

# "augementation_salaire_precedente" holds strings such as "11 %": pass them through the
# project helper first, then standardise the result.
# NOTE(review): inside the ColumnTransformer the helper receives a one-column DataFrame
# (the column is selected with a list), whereas earlier in the notebook it is called on a
# Series — confirm utils.text_to_numeric handles both.
text_pipeline = Pipeline(
    steps=[
        ("clean", FunctionTransformer(utils.text_to_numeric)),
        ("scale", StandardScaler()),
    ]
)

# Encode ordered categories as integers (order given by ordinales_categories), then standardise.
ordinal_pipeline = Pipeline(
    steps=[
        ("encode", OrdinalEncoder(categories=ordinales_categories)),
        ("scale", StandardScaler()),
    ]
)

# Route each column group to its transformer. Columns not listed in any group are
# dropped, which is ColumnTransformer's default `remainder` behaviour.
preprocessor = ColumnTransformer(
    [
        # One-hot encoding; drop="first" keeps n-1 indicator columns per feature to avoid redundancy.
        ("oneHot", OneHotEncoder(drop="first"), category_columns),
        # Ordered categorical columns.
        ("ordinal", ordinal_pipeline, ordinales_columns),
        # Text column that must be converted to numbers before scaling.
        ("textToNumeric", text_pipeline, ["augementation_salaire_precedente"]),
        # Plain numeric columns: standardisation only.
        ("scaler", StandardScaler(), numeric_columns),
    ]
)