In [1]:
# Import Modules
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from pathlib import Path
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

# Display all of the columns
pd.set_option('display.max_columns', None)

In [2]:
# Read the CSV file into a Pandas DataFrame
df = pd.read_csv(
    Path('employee_churn_data.csv')
)
df.head()

Unnamed: 0,department,promoted,review,projects,salary,tenure,satisfaction,bonus,avg_hrs_month,left
0,operations,0,0.577569,3,low,5.0,0.626759,0,180.86607,no
1,operations,0,0.7519,3,medium,6.0,0.443679,0,182.708149,no
2,support,0,0.722548,3,medium,6.0,0.446823,0,184.416084,no
3,logistics,0,0.675158,4,high,8.0,0.440139,0,188.707545,no
4,sales,0,0.676203,3,high,5.0,0.577607,1,179.821083,no


In [3]:
# Split target column from dataset
y = df['left']
X = df.drop(columns='left')

In [4]:
# Preview the data
X[:5]

Unnamed: 0,department,promoted,review,projects,salary,tenure,satisfaction,bonus,avg_hrs_month
0,operations,0,0.577569,3,low,5.0,0.626759,0,180.86607
1,operations,0,0.7519,3,medium,6.0,0.443679,0,182.708149
2,support,0,0.722548,3,medium,6.0,0.446823,0,184.416084
3,logistics,0,0.675158,4,high,8.0,0.440139,0,188.707545
4,sales,0,0.676203,3,high,5.0,0.577607,1,179.821083


In [5]:
# Print first five entries for target
y[:5]

0    no
1    no
2    no
3    no
4    no
Name: left, dtype: object

In [6]:
# Encode the categorical variables using get_dummies
X = pd.get_dummies(X)

In [7]:
X.head()

Unnamed: 0,promoted,review,projects,tenure,satisfaction,bonus,avg_hrs_month,department_IT,department_admin,department_engineering,department_finance,department_logistics,department_marketing,department_operations,department_retail,department_sales,department_support,salary_high,salary_low,salary_medium
0,0,0.577569,3,5.0,0.626759,0,180.86607,False,False,False,False,False,False,True,False,False,False,False,True,False
1,0,0.7519,3,6.0,0.443679,0,182.708149,False,False,False,False,False,False,True,False,False,False,False,False,True
2,0,0.722548,3,6.0,0.446823,0,184.416084,False,False,False,False,False,False,False,False,False,True,False,False,True
3,0,0.675158,4,8.0,0.440139,0,188.707545,False,False,False,False,True,False,False,False,False,False,True,False,False
4,0,0.676203,3,5.0,0.577607,1,179.821083,False,False,False,False,False,False,False,False,True,False,True,False,False


In [8]:
X.shape

(9540, 20)

In [9]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [10]:
X_train.shape

(7155, 20)

In [11]:
X_test.shape

(2385, 20)

In [12]:
# Create the StandardScaler instance
scaler = StandardScaler()
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [13]:
# Instantiate the model with k = 3 neighbors
model = KNeighborsClassifier(n_neighbors=3)

In [14]:
# Train the model
model.fit(X_train_scaled, y_train)

In [15]:
# Create predictions
y_pred = model.predict(X_test_scaled)

# Review the predictions
y_pred

array(['no', 'no', 'yes', ..., 'no', 'no', 'no'], dtype=object)

In [16]:
# Print confusion matrix
matrix = confusion_matrix(y_pred,y_test)
print(matrix)

# Calculating the accuracy score
accuracy_score(y_test, y_pred)

[[1444  278]
 [ 232  431]]


0.7861635220125787

In [17]:
# Print classification report
print("Confusion Matrix")
display(matrix)
print("")
print(f"Accuracy Score : {accuracy_score(y_test, y_pred)}")
print("")
print("Classification Report")
print(classification_report(y_pred, y_test, target_names = ['not leaving', 'leaving']))

Confusion Matrix


array([[1444,  278],
       [ 232,  431]], dtype=int64)


Accuracy Score : 0.7861635220125787

Classification Report
              precision    recall  f1-score   support

 not leaving       0.86      0.84      0.85      1722
     leaving       0.61      0.65      0.63       663

    accuracy                           0.79      2385
   macro avg       0.73      0.74      0.74      2385
weighted avg       0.79      0.79      0.79      2385

