In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [3]:
df=pd.read_csv('HR-Employee-Attrition.csv')

In [None]:
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [4]:
df.shape

(1470, 35)

# Using One Hot Encoding for Categorical Data

In [5]:
df['BusinessTravel']=df['BusinessTravel'].replace(['Non-Travel','Travel_Rarely','Travel_Frequently'], [0,1,2])
# Departments
df['Department']=df['Department'].astype('category')
dummies=pd.get_dummies(df['Department'],prefix='Department')
df = pd.concat([df, dummies], axis=1)
del df['Department']
# Education Field
df['EducationField']=df['EducationField'].astype('category')
dummies=pd.get_dummies(df['EducationField'],prefix='EducationField')
df = pd.concat([df, dummies], axis=1)
del df['EducationField']
# Gender
df['Gender']=df['Gender'].astype('category')
dummies=pd.get_dummies(df['Gender'],prefix='Gender')
df = pd.concat([df, dummies], axis=1)
del df['Gender']
# JobRole
df['JobRole']=df['JobRole'].astype('category')
dummies=pd.get_dummies(df['JobRole'],prefix='JobRole')
df = pd.concat([df, dummies], axis=1)
del df['JobRole']
# MaritalStatus
df['MaritalStatus']=df['MaritalStatus'].astype('category')
dummies=pd.get_dummies(df['MaritalStatus'],prefix='MaritalStatus')
df = pd.concat([df, dummies], axis=1)
del df['MaritalStatus']
# Dropping over 18 as its giving no data
del df['Over18']
#Dropping standard hours as it makes no sense
del df['StandardHours']


df['OverTime']=df['OverTime'].replace(['No','Yes'], [0,1])

df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
0,41,Yes,1,1102,1,2,1,1,2,94,...,False,False,False,False,False,True,False,False,False,True
1,49,No,2,279,8,1,1,2,3,61,...,False,False,False,False,True,False,False,False,True,False
2,37,Yes,1,1373,2,2,1,4,4,92,...,True,False,False,False,False,False,False,False,False,True
3,33,No,2,1392,3,4,1,5,4,56,...,False,False,False,False,True,False,False,False,True,False
4,27,No,1,591,2,1,1,7,1,40,...,True,False,False,False,False,False,False,False,True,False


In [6]:
df.shape

(1470, 51)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 51 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   Age                                1470 non-null   int64 
 1   Attrition                          1470 non-null   object
 2   BusinessTravel                     1470 non-null   int64 
 3   DailyRate                          1470 non-null   int64 
 4   DistanceFromHome                   1470 non-null   int64 
 5   Education                          1470 non-null   int64 
 6   EmployeeCount                      1470 non-null   int64 
 7   EmployeeNumber                     1470 non-null   int64 
 8   EnvironmentSatisfaction            1470 non-null   int64 
 9   HourlyRate                         1470 non-null   int64 
 10  JobInvolvement                     1470 non-null   int64 
 11  JobLevel                           1470 non-null   int64 
 12  JobSat

In [8]:
df['Attrition']=df['Attrition'].replace(['Yes', 'No'], [1,0])
df['Attrition']=df['Attrition'].astype('category')

# The data is imbalance, because the data is biased towards non-attrition



In [9]:
(df['Attrition'].value_counts()/1470)*100

Attrition
0    83.877551
1    16.122449
Name: count, dtype: float64

In [10]:
#Splitting the Data into Independent and Dependent Variable
X = df.drop(['Attrition'], axis=1)
Y = df[['Attrition']]

In [11]:
import imblearn
from imblearn.over_sampling import SMOTE
smote = SMOTE()
x_smote, y_smote = smote.fit_resample(X, Y)
print("Before Smoote" , Y.value_counts())
print()
print("After Smoote" , y_smote.value_counts())

Before Smoote Attrition
0            1233
1             237
Name: count, dtype: int64

After Smoote Attrition
0            1233
1            1233
Name: count, dtype: int64


In [12]:
from sklearn.model_selection import train_test_split
# X is eature set, Y is target variable
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [13]:
# Feature Scaling for X variables

from sklearn.preprocessing import StandardScaler

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to the training data and transform it
X_train = scaler.fit_transform(X_train)

# Transform the test data
X_test = scaler.transform(X_test)

In [14]:
#print the x_train and y_train for inspection purposes

# Print X_train
print(X_train)

# Print Y_train
print(Y_train)

[[-0.75016842 -0.15721149 -0.58244694 ...  1.86384362 -0.90267093
  -0.69784971]
 [-0.41863372 -0.15721149 -1.12354988 ... -0.5365257  -0.90267093
   1.43297329]
 [ 0.90750511 -0.15721149 -0.04626313 ... -0.5365257  -0.90267093
   1.43297329]
 ...
 [ 0.68648197 -0.15721149  0.91542436 ... -0.5365257   1.10782342
  -0.69784971]
 [ 0.13392413 -0.15721149 -1.3252337  ... -0.5365257   1.10782342
  -0.69784971]
 [ 0.35494726 -0.15721149 -0.36600577 ...  1.86384362 -0.90267093
  -0.69784971]]
     Attrition
338          0
363          1
759          0
793          0
581          0
...        ...
763          0
835          0
1216         0
559          0
684          0

[1029 rows x 1 columns]


# Use logistic regression model

In [15]:
#Build Logistic Regression Classifier

from sklearn.linear_model import LogisticRegression
LR_classifier = LogisticRegression(random_state=0)
LR_classifier.fit(X_train, Y_train)

In [16]:
#Predict the X_test
Y_pred = LR_classifier.predict(X_test)

In [17]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(Y_test, Y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[360  11]
 [ 38  32]]


In [18]:
from sklearn.metrics import classification_report

print("\nClassification Report:")
print(classification_report(Y_test, Y_pred))



Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.97      0.94       371
           1       0.74      0.46      0.57        70

    accuracy                           0.89       441
   macro avg       0.82      0.71      0.75       441
weighted avg       0.88      0.89      0.88       441



The accuracy of 89% indicates that the model correctly predicts the class for an instance 89% of the time across all classes.

# Use decision tree model



In [19]:
from sklearn.tree import DecisionTreeClassifier

# Instantiate the DecisionTreeClassifier
DT_classifier = DecisionTreeClassifier(random_state=0)

# Train the classifier
DT_classifier.fit(X_train, Y_train)

In [20]:
# Predict using the Decision Tree classifier
Y_pred = DT_classifier.predict(X_test)

In [21]:
from sklearn.metrics import confusion_matrix

# Assuming Y_test and Y_pred are already defined
conf_matrix = confusion_matrix(Y_test, Y_pred)

print("\nConfusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[318  53]
 [ 46  24]]


In [22]:
from sklearn.metrics import classification_report

print("\nClassification Report:")
print(classification_report(Y_test, Y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.86      0.87       371
           1       0.31      0.34      0.33        70

    accuracy                           0.78       441
   macro avg       0.59      0.60      0.60       441
weighted avg       0.78      0.78      0.78       441



The accuracy of 78% indicates that the model correctly predicts the class for an instance 78% of the time across all classes.

# Use SVM Model

In [24]:
from sklearn.svm import SVC

# Instantiate the SVC (Support Vector Classifier)
SVM_classifier = SVC(random_state=0)

# Train the classifier
SVM_classifier.fit(X_train, Y_train)

In [26]:
# Predict using the SVM classifier
Y_pred = SVM_classifier.predict(X_test)


In [27]:
# Calculate the confusion matrix
conf_matrix = confusion_matrix(Y_test, Y_pred)

In [28]:
print("\nConfusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[370   1]
 [ 58  12]]


In [29]:
# Print the classification report
print("\nClassification Report:")
print(classification_report(Y_test, Y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.86      1.00      0.93       371
           1       0.92      0.17      0.29        70

    accuracy                           0.87       441
   macro avg       0.89      0.58      0.61       441
weighted avg       0.87      0.87      0.83       441



The accuracy of 87% indicates that the model correctly predicts the class for an instance 87% of the time across all classes.

# Conclusion



1.   The dataset is imbalanced by class since we have 83% who have not left the company and 17% who have left the company.
2. The age group of IBM employees is concentrate between 25-45 years.
3. Attrition is more common in the younger age and it is more likely with females. Also, it is more common among single Employees.
4. People get lower opportunities to travel the company tend to leave the company.
5. People with very high education tend to have lower attrition.
6. By using logistic regression model, we achieve the accuracy of 89%, whereas for decision tree model, we achieve the accuracy of 78%, and for SVM model, we achieve the accuracy of 87%.


