## Identification des facteurs influençant le turnover des employés 

# Import Data

In [43]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime

In [44]:
# Read the HR dataset from the CSV file (path is relative to the notebook's
# working directory), then preview the first five rows.
csv_path = 'HRDataset_v14.csv'
df = pd.read_csv(csv_path)
df.head(n=5)

Unnamed: 0,Employee_Name,EmpID,MarriedID,MaritalStatusID,GenderID,EmpStatusID,DeptID,PerfScoreID,FromDiversityJobFairID,Salary,...,ManagerName,ManagerID,RecruitmentSource,PerformanceScore,EngagementSurvey,EmpSatisfaction,SpecialProjectsCount,LastPerformanceReview_Date,DaysLateLast30,Absences
0,"Adinolfi, Wilson K",10026,0,0,1,1,5,4,0,62506,...,Michael Albert,22.0,LinkedIn,Exceeds,4.6,5,0,1/17/2019,0,1
1,"Ait Sidi, Karthikeyan",10084,1,1,1,5,3,3,0,104437,...,Simon Roup,4.0,Indeed,Fully Meets,4.96,3,6,2/24/2016,0,17
2,"Akinkuolie, Sarah",10196,1,1,0,5,5,3,0,64955,...,Kissy Sullivan,20.0,LinkedIn,Fully Meets,3.02,3,0,5/15/2012,0,3
3,"Alagbe,Trina",10088,1,1,0,1,5,3,0,64991,...,Elijiah Gray,16.0,Indeed,Fully Meets,4.84,5,0,1/3/2019,0,15
4,"Anderson, Carol",10069,0,2,0,5,5,3,0,50825,...,Webster Butler,39.0,Google Search,Fully Meets,5.0,4,0,2/1/2016,0,2


In [45]:
# Summary statistics of the numeric columns, transposed so that each
# column of df becomes one row of the report.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
EmpID,311.0,10156.0,89.922189,10001.0,10078.5,10156.0,10233.5,10311.0
MarriedID,311.0,0.398714,0.490423,0.0,0.0,0.0,1.0,1.0
MaritalStatusID,311.0,0.810289,0.943239,0.0,0.0,1.0,1.0,4.0
GenderID,311.0,0.434084,0.496435,0.0,0.0,0.0,1.0,1.0
EmpStatusID,311.0,2.392283,1.794383,1.0,1.0,1.0,5.0,5.0
DeptID,311.0,4.610932,1.083487,1.0,5.0,5.0,5.0,6.0
PerfScoreID,311.0,2.977492,0.587072,1.0,3.0,3.0,3.0,4.0
FromDiversityJobFairID,311.0,0.093248,0.291248,0.0,0.0,0.0,0.0,1.0
Salary,311.0,69020.684887,25156.63693,45046.0,55501.5,62810.0,72036.0,250000.0
Termd,311.0,0.334405,0.472542,0.0,0.0,0.0,1.0,1.0


In [46]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 311 entries, 0 to 310
Data columns (total 36 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Employee_Name               311 non-null    object 
 1   EmpID                       311 non-null    int64  
 2   MarriedID                   311 non-null    int64  
 3   MaritalStatusID             311 non-null    int64  
 4   GenderID                    311 non-null    int64  
 5   EmpStatusID                 311 non-null    int64  
 6   DeptID                      311 non-null    int64  
 7   PerfScoreID                 311 non-null    int64  
 8   FromDiversityJobFairID      311 non-null    int64  
 9   Salary                      311 non-null    int64  
 10  Termd                       311 non-null    int64  
 11  PositionID                  311 non-null    int64  
 12  Position                    311 non-null    object 
 13  State                       311 non

# Data Engineering

In [47]:
# Convert the date columns from text to datetime so that the `.dt` accessors
# and date arithmetic can be used on them. Re-running this cell is safe:
# converting an already-converted column leaves it unchanged.
for date_col in ('DateofHire', 'DateofTermination', 'DOB', 'LastPerformanceReview_Date'):
    df[date_col] = pd.to_datetime(df[date_col])

# Reference date for the age computation.
# NOTE: this is the execution time, so 'Age' depends on when the notebook is run.
current_date = datetime.now()

# NOTE(review): if the CSV stores DOB with two-digit years, the parser maps
# 00-68 to 2000-2068 and older employees end up with a birth date in the
# future (and a negative age) - confirm against the raw file.
# A birth date later than the reference date is impossible, so such values
# are moved back by one century; valid dates are left untouched.
dob = df['DOB']
df['DOB'] = dob.mask(dob > current_date, dob - pd.DateOffset(years=100))

# Age in completed years: difference of the calendar years, minus one when
# this year's birthday has not happened yet. Computed column-wise with the
# `.dt` accessors instead of a row-by-row `.apply`.
dob = df['DOB']
birthday_pending = (
    (dob.dt.month > current_date.month)
    | ((dob.dt.month == current_date.month) & (dob.dt.day > current_date.day))
)
df['Age'] = current_date.year - dob.dt.year - birthday_pending.astype(int)

In [48]:
# List the columns now present in df (the derived 'Age' column is included).
df.columns

Index(['Employee_Name', 'EmpID', 'MarriedID', 'MaritalStatusID', 'GenderID',
       'EmpStatusID', 'DeptID', 'PerfScoreID', 'FromDiversityJobFairID',
       'Salary', 'Termd', 'PositionID', 'Position', 'State', 'Zip', 'DOB',
       'Sex', 'MaritalDesc', 'CitizenDesc', 'HispanicLatino', 'RaceDesc',
       'DateofHire', 'DateofTermination', 'TermReason', 'EmploymentStatus',
       'Department', 'ManagerName', 'ManagerID', 'RecruitmentSource',
       'PerformanceScore', 'EngagementSurvey', 'EmpSatisfaction',
       'SpecialProjectsCount', 'LastPerformanceReview_Date', 'DaysLateLast30',
       'Absences', 'Age'],
      dtype='object')

# Employee

## Emp par position


In [49]:
# Head-count per job position: count the employee ids in each 'Position'
# group and bring the group key back as a regular column for plotting.
emp_position = (
    df.groupby('Position')['EmpID']
    .count()
    .reset_index()
)

# Bar chart of the head-count, with readable axis titles and slanted
# tick labels on the x axis.
fig = px.bar(
    emp_position,
    x='Position',
    y='EmpID',
    title="Nombre d'employés par position",
)
fig.update_xaxes(title_text='Position', tickangle=-30)
fig.update_yaxes(title_text="Nombre d'employés")
fig.show()

## Emp par Departement

In [50]:
# Head-count per department, with the group key kept as a regular column.
emp_Dept = (
    df.groupby('Department')['EmpID']
    .count()
    .reset_index()
)

# Treemap in which each department's tile is sized by its head-count.
fig = px.treemap(
    emp_Dept,
    path=['Department'],
    values='EmpID',
    title="Nombre d'employés par département",
)
fig.update_layout(margin={'t': 50, 'l': 25, 'r': 25, 'b': 25})
fig.show()

## Emp par Manager

In [82]:
# Head-count per manager, largest team first.
emp_manager = (
    df.groupby('ManagerName')['EmpID']
    .count()
    .reset_index()
    .sort_values(by='EmpID', ascending=False)
)

# Bar chart of the head-count. The axes are titled explicitly: without this
# the y axis would be labelled 'EmpID' although it shows a number of employees.
fig = px.bar(emp_manager, x='ManagerName', y='EmpID', title="Nombre d'employés par manager")
fig.update_layout(xaxis_title='Manager', yaxis_title="Nombre d'employés")
fig.show()

## Emp par Gender

In [89]:
# Head-count per value of the 'Sex' column, with the key as a regular column.
emp_sex = (
    df.groupby('Sex')['EmpID']
    .count()
    .reset_index()
)

# Pie chart of the share of employees for each value of 'Sex'.
fig = px.pie(
    emp_sex,
    names='Sex',
    values='EmpID',
    title="Nombre d'employés par sexe",
)
fig.update_layout(margin={'t': 50, 'l': 25, 'r': 25, 'b': 25})
fig.show()

## Emp par MarriedStatus

In [76]:
# Marital status, coarse view: head-count per value of the 'MarriedID' flag.
emp_married_general = df.groupby('MarriedID')['EmpID'].count().reset_index()
# The title describes marital status (it previously read "par sexe",
# copied from the gender chart).
fig = px.pie(emp_married_general, names='MarriedID', values='EmpID', title="Nombre d'employés par statut marital (MarriedID)")
fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
fig.show()

# Marital status, detailed view: head-count per 'MaritalDesc' category.
emp_married_detail = df.groupby('MaritalDesc')['EmpID'].count().reset_index()
fig = px.pie(emp_married_detail, names='MaritalDesc', values='EmpID', title="Nombre d'employés par statut marital détaillé")
fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
fig.show()



# Satisfaction

## satisfaction par position

In [85]:
# Mean of the 'EmpSatisfaction' column for each job position, with the
# group key brought back as a regular column for plotting.
satsf_position = (
    df.groupby('Position')['EmpSatisfaction']
    .mean()
    .reset_index()
)

# Bar chart of the mean satisfaction per position.
fig = px.bar(
    satsf_position,
    x='Position',
    y='EmpSatisfaction',
    title="Satisfaction des employés par position",
)
fig.update_layout(
    xaxis_title='Position',
    yaxis_title="Satisfaction des employés",
    margin={'t': 50, 'l': 25, 'r': 25, 'b': 25},
)
fig.show()

## satisfaction par Departement

In [86]:
# Mean of the 'EmpSatisfaction' column for each department.
satsf_dept = df.groupby('Department')['EmpSatisfaction'].mean().reset_index()

# Bar chart of the mean satisfaction per department. The title and the
# x-axis title now name the department (they previously read "par position"
# and 'PositDepartmention', left over from the position chart).
fig = px.bar(satsf_dept, x='Department', y='EmpSatisfaction', title="Satisfaction des employés par département")
fig.update_layout(xaxis_title='Department', yaxis_title="Satisfaction des employés", margin=dict(t=50, l=25, r=25, b=25))
fig.show()

## satisfaction par gender

In [90]:
# Mean of the 'EmpSatisfaction' column for each value of 'Sex'.
satsf_sex = df.groupby('Sex')['EmpSatisfaction'].mean().reset_index()

# Bar chart of the mean satisfaction per sex. The title now matches the
# grouping (it previously read "par position", copied from another chart).
fig = px.bar(satsf_sex, x='Sex', y='EmpSatisfaction', title="Satisfaction des employés par sexe")
fig.update_layout(xaxis_title='Sex', yaxis_title="Satisfaction des employés", margin=dict(t=50, l=25, r=25, b=25))
fig.show()

## satisfaction par Married

In [91]:
# Mean of the 'EmpSatisfaction' column for each 'MaritalDesc' category.
satsf_Marital = df.groupby('MaritalDesc')['EmpSatisfaction'].mean().reset_index()

# Bar chart of the mean satisfaction per marital status. The title now matches
# the grouping (it previously read "par position", copied from another chart).
fig = px.bar(satsf_Marital, x='MaritalDesc', y='EmpSatisfaction', title="Satisfaction des employés par statut marital")
fig.update_layout(xaxis_title='MaritalDesc', yaxis_title="Satisfaction des employés", margin=dict(t=50, l=25, r=25, b=25))
fig.show()

In [88]:
# Show the column names of df again, as a reference for the next analyses.
df.columns

Index(['Employee_Name', 'EmpID', 'MarriedID', 'MaritalStatusID', 'GenderID',
       'EmpStatusID', 'DeptID', 'PerfScoreID', 'FromDiversityJobFairID',
       'Salary', 'Termd', 'PositionID', 'Position', 'State', 'Zip', 'DOB',
       'Sex', 'MaritalDesc', 'CitizenDesc', 'HispanicLatino', 'RaceDesc',
       'DateofHire', 'DateofTermination', 'TermReason', 'EmploymentStatus',
       'Department', 'ManagerName', 'ManagerID', 'RecruitmentSource',
       'PerformanceScore', 'EngagementSurvey', 'EmpSatisfaction',
       'SpecialProjectsCount', 'LastPerformanceReview_Date', 'DaysLateLast30',
       'Absences', 'Age'],
      dtype='object')