In [1]:
import pandas as pd
import numpy as np
from pycaret.classification import *

Let us read our test set, then apply our saved column transformer and trained model to it.

In [2]:
# Read the employee-churn test set from disk and preview its first 15 rows.
test_csv_path = 'employee_churn_test.csv'
test = pd.read_csv(test_csv_path)
test.head(n=15)

Unnamed: 0.1,Unnamed: 0,department,promoted,review,projects,salary,tenure,satisfaction,bonus,avg_hrs_month,left
0,0,marketing,0,0.634404,3,medium,6.0,0.229932,0,182.164361,no
1,1,sales,0,0.707144,4,high,6.0,0.410709,0,183.50505,no
2,2,marketing,0,0.770763,3,high,7.0,0.708192,1,187.423783,yes
3,3,retail,0,0.776789,3,high,7.0,0.170167,0,185.552216,yes
4,4,sales,0,0.600934,4,medium,7.0,0.53409,1,186.735804,no
5,5,engineering,0,0.706875,3,low,9.0,0.546038,0,192.259457,no
6,6,sales,0,0.830927,4,medium,4.0,0.383397,0,177.407245,yes
7,7,retail,0,0.656803,3,medium,5.0,0.516598,0,180.827321,no
8,8,sales,1,0.538342,3,medium,8.0,0.339149,0,187.906683,no
9,9,IT,0,0.505262,3,medium,9.0,0.405539,1,190.757032,no


In [3]:
#Remove extra column
# The CSV carries a leftover index column that pandas reads back as 'Unnamed: 0'
# (presumably written by an earlier to_csv call that kept the index).
# Dropping it by name rather than by position keeps this cell idempotent:
# re-running it no longer strips the next real column ('department').
test = test.drop(columns=['Unnamed: 0'], errors='ignore')
test.head(15)

Unnamed: 0,department,promoted,review,projects,salary,tenure,satisfaction,bonus,avg_hrs_month,left
0,marketing,0,0.634404,3,medium,6.0,0.229932,0,182.164361,no
1,sales,0,0.707144,4,high,6.0,0.410709,0,183.50505,no
2,marketing,0,0.770763,3,high,7.0,0.708192,1,187.423783,yes
3,retail,0,0.776789,3,high,7.0,0.170167,0,185.552216,yes
4,sales,0,0.600934,4,medium,7.0,0.53409,1,186.735804,no
5,engineering,0,0.706875,3,low,9.0,0.546038,0,192.259457,no
6,sales,0,0.830927,4,medium,4.0,0.383397,0,177.407245,yes
7,retail,0,0.656803,3,medium,5.0,0.516598,0,180.827321,no
8,sales,1,0.538342,3,medium,8.0,0.339149,0,187.906683,no
9,IT,0,0.505262,3,medium,9.0,0.405539,1,190.757032,no


In [4]:
from pickle import load

# Load the saved column transformer (presumably fitted during training) and
# apply it to the test set.
# NOTE(security): unpickling can execute arbitrary code -- only load
# 'column_transformer.pkl' if it comes from a trusted source.
# The context manager closes the file handle as soon as the object is loaded,
# instead of leaving it open until garbage collection.
with open('column_transformer.pkl', 'rb') as transformer_file:
    ct = load(transformer_file)
preprocessed_test = ct.transform(test)

In [5]:
# Wrap the transformer output in a DataFrame with readable column labels.
# The label order is assumed to match the transformer's output order:
# department indicator columns, then the numeric features, then the remaining columns.
department_columns = ['IT', 'admin', 'engineering', 'finance', 'logistics',
                      'marketing', 'operations', 'retail', 'sales', 'support']
numeric_columns = ['review', 'projects', 'tenure', 'satisfaction', 'avg_hrs_month']
remaining_columns = ['promoted', 'salary', 'bonus', 'left']

preprocessed_test = pd.DataFrame(
    preprocessed_test,
    columns=department_columns + numeric_columns + remaining_columns,
)

In [6]:
# Preview the first five rows of the labelled, preprocessed test set.
preprocessed_test.head(n=5)

Unnamed: 0,IT,admin,engineering,finance,logistics,marketing,operations,retail,sales,support,review,projects,tenure,satisfaction,avg_hrs_month,promoted,salary,bonus,left
0,0,0,0,0,0,1,0,0,0,0,-0.197564,-0.478052,-0.386689,-1.729275,-0.59264,0,medium,0,no
1,0,0,0,0,0,0,0,0,1,0,0.649629,1.253553,-0.386689,-0.593574,-0.270571,0,high,0,no
2,0,0,0,0,0,1,0,0,0,0,1.390588,-0.478052,0.316985,1.275317,0.670813,0,high,1,yes
3,0,0,0,0,0,0,0,1,0,0,1.460778,-0.478052,0.316985,-2.10474,0.221213,0,high,0,yes
4,0,0,0,0,0,0,0,0,1,0,-0.587372,1.253553,0.316985,0.181546,0.505543,0,medium,1,no


We only need a subset of these columns: the features `review`, `tenure`, `satisfaction`, and `avg_hrs_month`, plus the target `left`, which we recode to 1 (yes) and 0 (no).

In [7]:
#Use important columns only
# Keep the four model features plus the target, and recode the target to 1/0.
# .assign() builds a new DataFrame from the selection, so the recode never writes
# into a slice of preprocessed_test (which raised SettingWithCopyWarning before).
important_columns = ['review','tenure','satisfaction','avg_hrs_month','left']
recode = {'yes':1,'no':0}
test_filtered = preprocessed_test[important_columns].assign(
    left=lambda frame: frame['left'].map(recode)
)
test_filtered.head(15)

Unnamed: 0,review,tenure,satisfaction,avg_hrs_month,left
0,-0.197564,-0.386689,-1.729275,-0.59264,0
1,0.649629,-0.386689,-0.593574,-0.270571,0
2,1.390588,0.316985,1.275317,0.670813,1
3,1.460778,0.316985,-2.10474,0.221213,1
4,-0.587372,0.316985,0.181546,0.505543,0
5,0.646501,1.724334,0.256606,1.832471,0
6,2.091311,-1.794037,-0.765159,-1.735426,1
7,0.06332,-1.090363,0.071655,-0.913833,0
8,-1.316373,1.02066,-1.043139,0.786819,0
9,-1.701649,1.724334,-0.626057,1.471549,0


In [8]:
# Sanity check: (rows, columns) of the raw test frame, i.e. before column
# transformation and feature filtering.
test.shape

(1908, 10)

In [9]:
# Initialise the PyCaret experiment on the filtered test data with 'left' as the target.
# session_id pins the random seed so a Restart & Run All reproduces the same run;
# 4446 is the seed PyCaret drew at random in the recorded execution of this cell.
test_setup = setup(data=test_filtered, target='left', session_id=4446)

Unnamed: 0,Description,Value
0,session_id,4446
1,Target,left
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(1908, 5)"
5,Missing Values,False
6,Numeric Features,4
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [10]:
# Restore the saved PyCaret transformation pipeline and model by name.
model_name = 'employee_churn_model'
loaded_model = load_model(model_name)

Transformation Pipeline and Model Successfully Loaded


In [11]:
# Score the filtered test set with the loaded model. predict_model prints the
# metric summary and returns the data with the appended Label and Score columns.
test_predictions = predict_model(loaded_model, data=test_filtered)
test_predictions

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.8422,0.922,0.8188,0.7046,0.7575,0.6415,0.6455


Unnamed: 0,review,tenure,satisfaction,avg_hrs_month,left,Label,Score
0,-0.197564,-0.386689,-1.729275,-0.59264,0,0.0,0.9946
1,0.649629,-0.386689,-0.593574,-0.270571,0,1.0,0.7463
2,1.390588,0.316985,1.275317,0.670813,1,1.0,0.9970
3,1.460778,0.316985,-2.10474,0.221213,1,1.0,0.5905
4,-0.587372,0.316985,0.181546,0.505543,0,1.0,0.5589
...,...,...,...,...,...,...,...
1903,0.305622,1.724334,0.081787,1.836521,0,0.0,0.9973
1904,-1.099987,-1.090363,1.934001,-0.968264,0,0.0,0.9997
1905,0.484179,0.316985,-0.934961,0.072052,0,1.0,0.6239
1906,0.449716,0.316985,-0.604026,0.472765,0,0.0,0.8513


The AUC on the test set is 0.922 while accuracy is at 84.22%. Recall is at 81.88%.