In [1]:
# Turn off Jedi for the IPython completer (sets Completer.use_jedi to False)
%config Completer.use_jedi = False

In [41]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import joblib

In [5]:
# Read the HR employee attrition CSV into a DataFrame and preview the first five rows
df = pd.read_csv(filepath_or_buffer='../data/WA_Fn-UseC_-HR-Employee-Attrition.csv')
df.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [7]:
# Dataset dimensions as (rows, columns)
(len(df.index), len(df.columns))

(1470, 35)

In [8]:
# Count missing values in each column
df.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [10]:
# Summarize each column's non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [20]:
# Convert the Attrition target to integer codes.
# LabelEncoder orders its classes by sorting them, so 'No' becomes 0 and 'Yes' becomes 1.
le = LabelEncoder()
le.fit(df['Attrition'])
df['Attrition'] = le.transform(df['Attrition'])

In [22]:
# Pull out the target column, and keep every other column as the feature frame
y = df['Attrition']
X = df.drop('Attrition', axis=1)

In [23]:
# Partition the feature columns by dtype:
# object (string) columns are treated as categorical, int64/float64 columns as numerical.

categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

In [25]:
# Split the data, then preprocess it.
#
# The split happens BEFORE fitting the transformer so that the scaler's mean/variance
# and the encoder's category vocabulary are learned from the training rows only;
# fitting on the full dataset would leak information about the test rows into training.
# The raw feature frame `X` is deliberately not overwritten, so this cell can be re-run safely.

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

preprocessor = ColumnTransformer(
    transformers=[
        # standardize numerical columns to zero mean / unit variance
        ('num', StandardScaler(), numerical_cols),
        # one-hot encode categorical columns; categories not seen during fit are
        # encoded as all zeros instead of raising an error at transform time
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Fit on the training rows only, then apply the already-fitted transformer to the test rows.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)


In [28]:
# creating logistic regression model

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [30]:
# Train a logistic regression classifier (solver allowed up to 1000 iterations)
# and predict labels for the test set. `fit` returns the fitted estimator itself.
model_lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = model_lr.predict(X_test)

In [31]:
# Evaluate the logistic regression predictions against the held-out labels

accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# One metric per line, same text as four separate print calls
print(
    f'Accuracy:--> {accuracy}',
    f'Precision:--> {precision}',
    f'Recall_Score:--> {recall}',
    f'F1_Score:--> {f1}',
    sep='\n',
)


Accuracy:--> 0.891156462585034
Precision:--> 0.6206896551724138
Recall_Score:--> 0.46153846153846156
F1_Score:--> 0.5294117647058824


In [32]:
# Let's try to improve accuracy using a random forest and see how it performs

from sklearn.ensemble import RandomForestClassifier


In [39]:
# Train a 100-tree random forest with a fixed seed and predict labels for the test set.
# `fit` returns the fitted estimator itself.
model_rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = model_rf.predict(X_test)

In [40]:
# Evaluate the random forest predictions against the held-out labels
accuracy_rf, precision_rf, recall_rf, f1_rf = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# One metric per line, same text as four separate print calls
print(
    f'RandomForest Accuracy:--> {accuracy_rf}',
    f'RandomForest Precision:--> {precision_rf}',
    f'RandomForest Recall_Score:--> {recall_rf}',
    f'RandomForest F1_Score:--> {f1_rf}',
    sep='\n',
)

RandomForest Accuracy:--> 0.8775510204081632
RandomForest Precision:--> 0.6666666666666666
RandomForest Recall_Score:--> 0.15384615384615385
RandomForest F1_Score:--> 0.25


In [42]:
# As we can see, our logistic regression model performed better, so that is the model we save.

# The model was trained on the output of `preprocessor`, not on raw columns, so the
# fitted transformer is persisted too: both artifacts are required to score new,
# raw employee records later (load both, call preprocessor.transform, then model.predict).
joblib.dump(preprocessor, 'employee_attrition_preprocessor.pkl')

# Save the logistic regression model itself under its existing file name.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(model_lr, 'employee_attrition.pkl')

['employee_attrition.pkl']