# Employee Attrition


### Data set used in this analysis
<p>HR dataset named ‘IBM HR Analytics Employee Attrition & Performance’ </p>


### Importing libraries for data handling and analysis


In [None]:
import pandas as pd
from pandas.plotting import scatter_matrix
from pandas import ExcelWriter
from pandas import ExcelFile
import numpy as np
from scipy.stats import norm, skew
from scipy import stats
import statsmodels.api as sm

### Importing libraries for data visualisations


In [None]:
import seaborn as sns
from matplotlib import pyplot
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import matplotlib
%matplotlib inline
color = sns.color_palette()
from IPython.display import display
pd.options.display.max_columns = None

### Explain the functionalities of the above libraries below 

## Loading the dataset

In [None]:
import zipfile

# The CSV is read straight out of the zip archive; nothing is extracted to disk.
archive_path = "attrition.zip"
csv_member = "WA_Fn-UseC_-HR-Employee-Attrition.csv"

with zipfile.ZipFile(archive_path) as archive, archive.open(csv_member) as csv_file:
    attrition = pd.read_csv(csv_file)

# Preview the first rows of the loaded frame
attrition.head()

Information on the dataset can be found here:

http://inseaddataanalytics.github.io/INSEADAnalytics/groupprojects/January2018FBL/IBM_Attrition_VSS.html

In [None]:
#Exploring and understanding the dataset
# Write code below

### Data Cleaning and Transformation

In [None]:
#checking for duplicate row entries in the data
# Write code below

In [None]:
#Computing number of unique values in the fields (columns) of the dataset
# Write code below

In [None]:
#Checking for redundant columns
# Write code below

In [None]:
#Removing redundant columns
# Write code below

For conversion to Categorical type, we only want to use non-numeric columns

In [None]:
#For "dirtier" version of the dataset, this would have been needed
#attrition['MaritalStatus']= attrition['MaritalStatus'].replace('Divorcedddd','Divorced')
#attrition['MaritalStatus']= attrition['MaritalStatus'].replace('Divorceddd','Divorced')

In [None]:
#Checking for columns to convert to Categorical
# Write code below

In [None]:
# Convert appropriate columns to categories. 
# Write code below

### Exploratory Data Analysis

Analyzing outliers

In [None]:
# Box plot of Age, used to eyeball whether the column contains outliers
age_values = attrition['Age']
plt.boxplot(age_values)

In [None]:
# Box plot of TotalWorkingYears, used to eyeball whether the column contains outliers
working_years = attrition['TotalWorkingYears']
plt.boxplot(working_years)

In [None]:
# Describe the function below
def outliers(df, column, whisker=1.5):
    """Return the rows of ``df`` whose ``column`` value is not an IQR outlier.

    A row is kept when its value lies inside
    ``[Q1 - whisker * IQR, Q3 + whisker * IQR]`` (both bounds inclusive),
    where Q1 and Q3 are the 25th and 75th percentiles of the column and
    ``IQR = Q3 - Q1``. Rows whose value is missing are dropped too, because
    ``Series.between`` evaluates to False for NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : label
        Name of the numeric column the rule is applied to.
    whisker : float, default 1.5
        Multiple of the IQR tolerated beyond the quartiles.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding only the rows inside the bounds.
    """
    lower_quart = df[column].quantile(0.25)
    upper_quart = df[column].quantile(0.75)
    threshold = whisker * (upper_quart - lower_quart)
    in_range = df[column].between(lower_quart - threshold, upper_quart + threshold)
    # Return a copy rather than a slice so that callers can add or overwrite
    # columns on the result without triggering SettingWithCopyWarning.
    return df[in_range].copy()



In [None]:
# Replace the working frame with only the rows whose TotalWorkingYears
# passes the IQR rule implemented by outliers()
attrition = outliers(df=attrition, column='TotalWorkingYears')

In [None]:
# (rows, columns) of the frame after the TotalWorkingYears outlier filter
attrition.shape

#### Correlation Matrix

In [None]:
# Pairwise correlations, computed on the numeric columns only. The frame also
# holds non-numeric columns (e.g. MaritalStatus); pandas >= 2.0 no longer drops
# those silently in DataFrame.corr() and raises instead, so select explicitly.
x = attrition.select_dtypes(include='number').corr()
x

In [None]:
plt.figure(figsize=(25,20))
# Correlation between the numeric columns, used to spot highly correlated
# features (multi-collinearity candidates). Non-numeric columns are excluded
# explicitly because DataFrame.corr() raises on them in pandas >= 2.0.
numeric_corr = attrition.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True)
plt.title('Heatmap Correlation between the Input features')

##### Insights for correlation matrix:

We'll consider high correlations (i.e., between 0.7 and 1 for direct correlation and from −1 to −0.7 for inverse correlation): 

Explain findings


## Data Analysis

### Department with highest attrition rates

In [None]:
# Count the Attrition values separately within each Department
attrition_by_department = attrition['Attrition'].groupby(attrition['Department'])
df = attrition_by_department.value_counts(normalize=False)
df

In [None]:
# Turn the (Department, Attrition) index levels into ordinary columns.
# pandas releases label the value_counts() result differently (series name and
# inner index-level name), which can make reset_index() fail or yield
# unexpected column names. Pin the labels first so the result always has the
# columns: Department, level_1 (attrition label), Attrition (count).
df = df.rename_axis(['Department', None]).rename('Attrition').reset_index()
df

In [None]:
#Need to rename columns so that the values are correctly interpreted
# The three columns are, in order: department, attrition label, count.
# Their names after reset_index() depend on the pandas version, so they are
# assigned by position instead of through a name -> name mapping.
df = df.copy()
df.columns = ["Department", "Attrition", "Count of Employees"]
df

In [None]:
# Grouped bars: one group per Department, one bar per Attrition value
dept_plot = sns.barplot(data=df, x="Department", y="Count of Employees", hue="Attrition")
dept_plot.set_title('Attrition for employees of Different Departments')

In [None]:
# Looking at the values from a proportions perspective
# normalize=True turns the counts into fractions within each Department.
# The index levels and the value column are named explicitly, because the
# labels value_counts() produces differ between pandas versions.
df = (
    attrition['Attrition']
    .groupby(attrition['Department'])
    .value_counts(normalize=True)
    .rename_axis(['Department', 'Attrition'])
    .rename('Proportion of Employees')
    .reset_index()
)
sns.barplot(x="Department", y="Proportion of Employees", hue="Attrition", data=df)
plt.title('Attrition of employees per Department - Proportions')

In [None]:
# Mean EnvironmentSatisfaction for each Attrition value, one bar per Department.
# Error bars show the default confidence interval (95%).
ax = sns.catplot(
    data=attrition,
    kind="bar",
    x='Attrition',
    y='EnvironmentSatisfaction',
    hue='Department',
)
plt.title('Attrition for different department employees considering their EnvironmentSatisfaction')

In [None]:
# Mean DistanceFromHome for each Attrition value, one bar per Department.
# Error bars show a 90% confidence interval (ci=90).
# NOTE(review): newer seaborn releases prefer errorbar=("ci", 90) over ci=90 - confirm
# against the installed version before switching.
ax = sns.catplot(
    data=attrition,
    kind="bar",
    x='Attrition',
    y='DistanceFromHome',
    hue='Department',
    ci=90,
)
plt.title('Attrition for different department employees considering their Distance From Home')

In [None]:
### Explain the code above. What additional analysis would you do?

##### What insights does this analysis provide: 

### Job Role with highest Attrition rate

In [None]:
# Distinct job roles present in the data
attrition['JobRole'].unique()

In [None]:
# Number of employees per job role.
# The column is passed as x= because seaborn >= 0.12 treats the first
# positional argument as `data`, not as the x variable.
new_graph = sns.countplot(x=attrition['JobRole'])

# Rotate the tick labels seaborn already placed. Overwriting them with
# unique() could mislabel bars whenever the plotting order differs from the
# order of first appearance (e.g. once the column is a Categorical).
plt.xticks(rotation=90)

plt.title('Employees per Job Role')


In [None]:
#Attrition per Job Role
fig=plt.subplots(figsize=(20,5))  #stretching the plot
# Count the Attrition values within each JobRole. Index levels and the value
# column are named explicitly, because the labels value_counts() produces
# differ between pandas versions.
df = (
    attrition['Attrition']
    .groupby(attrition['JobRole'])
    .value_counts(normalize=False)
    .rename_axis(['JobRole', 'Attrition'])
    .rename('Count of Employees')
    .reset_index()
)
sns.barplot(x="JobRole", y="Count of Employees", hue="Attrition", data=df)
plt.title('Attrition of employees per Job Role')

In [None]:
#Attrition per Job Level
# Count the Attrition values within each JobLevel. Index levels and the value
# column are named explicitly, because the labels value_counts() produces
# differ between pandas versions.
df = (
    attrition['Attrition']
    .groupby(attrition['JobLevel'])
    .value_counts(normalize=False)
    .rename_axis(['JobLevel', 'Attrition'])
    .rename('Count of Employees')
    .reset_index()
)
sns.barplot(x="JobLevel", y="Count of Employees", hue="Attrition", data=df)
plt.title('Attrition for employees of Different Job Levels')

In [None]:
#Attrition per Job Level - proportions
# normalize=True turns the counts into fractions within each JobLevel.
# Index levels and the value column are named explicitly, because the labels
# value_counts() produces differ between pandas versions.
df = (
    attrition['Attrition']
    .groupby(attrition['JobLevel'])
    .value_counts(normalize=True)
    .rename_axis(['JobLevel', 'Attrition'])
    .rename('Proportion of Employees')
    .reset_index()
)
sns.barplot(x="JobLevel", y="Proportion of Employees", hue="Attrition", data=df)
plt.title('Attrition for employees of Different Job Levels - Proportions')

In [None]:
### Explain above code

##### What insights does this analysis provide:

### Effect of hourly rate of an employee on Attrition

In [None]:
# Distribution of the raw HourlyRate values, drawn with 20 bins
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot/displot once the minimum seaborn version is confirmed.
hourly_rate_plot = sns.distplot(attrition['HourlyRate'], bins=20)
hourly_rate_plot.set_title('Distribution of HourlyRate attribute')

In [None]:
# Discretize HourlyRate into 4 bins. pd.cut builds right-closed intervals by
# default, i.e. (25, 45], (45, 60], (60, 80], (80, 110].
# This overwrites the numeric column in place, so run it once per data load.
hourly_rate_edges = [25, 45, 60, 80, 110]
labels = ['25-45', '45-60', '60-80', '80-110']  # one label per interval above

attrition['HourlyRate'] = pd.cut(
    attrition['HourlyRate'],
    bins=hourly_rate_edges,
    labels=labels,
)


In [None]:
# Employees per HourlyRate bin. The column is passed as x= because
# seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x=attrition['HourlyRate'])
plt.title('Discretized HourlyRate attribute distribution')

In [None]:
#Functions to create our attrition graphs
def _attrition_breakdown(inpdata, column, normalize, value_name):
    """Tabulate the Attrition values within each group of ``column``.

    Returns a DataFrame with exactly three columns: ``column``, "Attrition"
    and ``value_name``. ``value_name`` holds counts, or within-group fractions
    when ``normalize`` is True. The index levels and the value series are
    named explicitly because the labels ``value_counts()`` produces differ
    between pandas versions.
    """
    breakdown = inpdata['Attrition'].groupby(inpdata[column]).value_counts(normalize=normalize)
    return breakdown.rename_axis([column, 'Attrition']).rename(value_name).reset_index()


#Graph with employee counts
def attritiongraph(inpdata, column):
    """Draw grouped bars of employee counts per ``column`` value, split by Attrition.

    Parameters
    ----------
    inpdata : pandas.DataFrame
        Must contain an "Attrition" column and ``column``.
    column : str
        Column whose values form the x-axis groups.

    The plot is drawn with seaborn on the current matplotlib axes; nothing is
    returned.
    """
    value_name = "Count of Employees"
    df = _attrition_breakdown(inpdata, column, normalize=False, value_name=value_name)
    sns.barplot(x=column, y=value_name, hue="Attrition", data=df)


#Graph with proportions of the count of employees
def attritionpropgraph(inpdata, column):
    """Draw grouped bars of the Attrition share within each ``column`` value.

    Same inputs as ``attritiongraph``. Bar heights are fractions that sum to 1
    within each ``column`` value, so the y-axis is labelled as a proportion
    rather than a count.

    The plot is drawn with seaborn on the current matplotlib axes; nothing is
    returned.
    """
    value_name = "Proportion of Employees"
    df = _attrition_breakdown(inpdata, column, normalize=True, value_name=value_name)
    sns.barplot(x=column, y=value_name, hue="Attrition", data=df)
    

In [None]:
# Employee counts per HourlyRate bin, split by Attrition
attritiongraph(inpdata=attrition, column='HourlyRate')
plt.gca().set_title('Attrition per HourlyRate')

In [None]:
#Attrition per Hourly Rate - proportions
attritionpropgraph(attrition,'HourlyRate')
# Title carries the "- Proportions" suffix so this chart is distinguishable
# from the count chart, as in the Department and Job Level sections.
plt.title('Attrition per HourlyRate - Proportions')

In [None]:
# Explain above code 

##### What insight does this analysis provide:

### Impact of Gender on Employee Attrition

In [None]:
# Employee counts per Gender, split by Attrition. Letting countplot do the
# counting is quicker than building the table by hand, but it only yields raw
# counts - proportions cannot be obtained this way.
gender_plot = sns.countplot(x=attrition.Gender, hue=attrition.Attrition)
gender_plot.set_xlabel('Gender')
gender_plot.set_ylabel('Employee Count')

In [None]:
# Bar height is the average MonthlyIncome per Department, split by Attrition
# (hue), with one panel per Gender (col).
sns.catplot(
    data=attrition,
    kind="bar",
    x="Department",
    y="MonthlyIncome",
    hue="Attrition",
    col="Gender",
)

### What insights does this analysis provide?


### Education field of an Employee 

In [None]:
# Wide figure so the EducationField labels do not overlap
fig = plt.subplots(figsize=(20, 5))
attritiongraph(inpdata=attrition, column='EducationField')
plt.gca().set_title('Attrition per EducationField')

In [None]:
# Wide figure so the EducationField labels do not overlap
fig=plt.subplots(figsize=(20,5))
attritionpropgraph(attrition,'EducationField')
# Title carries the "- Proportions" suffix so this chart is distinguishable
# from the count chart, as in the Department and Job Level sections.
plt.title('Attrition per EducationField - Proportions')

##### What insight does this analysis provide:

### Impact of Marital Status on employee attrition

In [None]:
# Distinct marital-status values present in the data (also a quick check for misspelled categories)
attrition['MaritalStatus'].unique()


In [None]:
# Employees per marital status. The column is passed as x= because
# seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x=attrition['MaritalStatus'])
plt.title('MaritalStatus attribute distribution')


In [None]:
# Wide figure; employee counts per MaritalStatus, split by Attrition
fig = plt.subplots(figsize=(20, 5))
attritiongraph(inpdata=attrition, column='MaritalStatus')
plt.gca().set_title('Attrition per MaritalStatus')

##### What insight does this analysis provide?

### Employee attrition based on the number of years they spent at the company

In [None]:
# Distinct YearsAtCompany values, inspected before the column is binned
attrition['YearsAtCompany'].unique()

In [None]:
# Discretizing YearsAtCompany into 6 five-year groups.
# pd.cut intervals are open on the left by default, so a value of 0 would fall
# outside (0, 5] and become NaN; include_lowest=True closes the first bin so
# employees with 0 years land in '0-5'.
# NOTE(review): values above 30 still fall outside the bins and become NaN -
# confirm the column's maximum after the outlier filter.
tenure_edges = [0, 5, 10, 15, 20, 25, 30]
tenure_labels = ['0-5', '5-10', '10-15', '15-20', '20-25', '25-30']
attrition['YearsAtCompany'] = pd.cut(
    attrition['YearsAtCompany'],
    bins=tenure_edges,
    labels=tenure_labels,
    include_lowest=True,
)

In [None]:
# Employees per YearsAtCompany bin. The column is passed as x= because
# seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x=attrition['YearsAtCompany'])
plt.title('YearsAtCompany distribution')

In [None]:
# Wide figure; employee counts per YearsAtCompany bin, split by Attrition
fig = plt.subplots(figsize=(20, 5))
attritiongraph(inpdata=attrition, column='YearsAtCompany')
plt.gca().set_title('Attrition vs. YearsAtCompany')

In [None]:
# Wide figure; Attrition share within each YearsAtCompany bin
fig=plt.subplots(figsize=(20,5))
attritionpropgraph(attrition,'YearsAtCompany')
# Title carries the "- Proportions" suffix so this chart is distinguishable
# from the count chart, as in the Department and Job Level sections.
plt.title('Attrition vs. YearsAtCompany - Proportions')

##### What insight does this analysis provide:

### Comments
<p>No analysis of impact of salary hike, age, years since last promotion</p>
Build a predictive model ?