In [1]:
import pandas as pd
from pathlib import Path
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
# Read in the CSV file as a Pandas DataFrame
employee_churn_df = pd.read_csv(
    Path("employee_churn_data.csv")
)

# Review the DataFrame
employee_churn_df.head()

Unnamed: 0,department,promoted,review,projects,salary,tenure,satisfaction,bonus,avg_hrs_month,left
0,operations,0,0.577569,3,low,5.0,0.626759,0,180.86607,no
1,operations,0,0.7519,3,medium,6.0,0.443679,0,182.708149,no
2,support,0,0.722548,3,medium,6.0,0.446823,0,184.416084,no
3,logistics,0,0.675158,4,high,8.0,0.440139,0,188.707545,no
4,sales,0,0.676203,3,high,5.0,0.577607,1,179.821083,no


In [3]:
employee_churn_df.dtypes

department        object
promoted           int64
review           float64
projects           int64
salary            object
tenure           float64
satisfaction     float64
bonus              int64
avg_hrs_month    float64
left              object
dtype: object

In [4]:
# Scaling the numeric columns
employee_churn_scaled = StandardScaler().fit_transform(employee_churn_df[["promoted", "review", "projects", "tenure", "satisfaction", "bonus", "avg_hrs_month"]])

# Creating a DataFrame with with the scaled data
df_churn_transformed = pd.DataFrame(employee_churn_scaled, columns=["promoted", "review", "projects", "tenure", "satisfaction", "bonus", "avg_hrs_month"])

# Display sample data
df_churn_transformed.head()


Unnamed: 0,promoted,review,projects,tenure,satisfaction,bonus,avg_hrs_month
0,-0.176748,-0.870524,-0.474599,-1.099499,0.770211,-0.518771,-0.915767
1,-0.176748,1.173161,-0.474599,-0.392964,-0.384527,-0.518771,-0.471316
2,-0.176748,0.829076,-0.474599,-0.392964,-0.364695,-0.518771,-0.05923
3,-0.176748,0.273521,1.252203,1.020106,-0.406856,-0.518771,0.9762
4,-0.176748,0.28577,-0.474599,-1.099499,0.460198,1.927633,-1.167898


In [5]:
employee_churn_df["salary"].value_counts()

salary
medium    6611
high      1548
low       1381
Name: count, dtype: int64

In [6]:
# Transform the Department Type column using get_dummies()
department_dummies = pd.get_dummies(employee_churn_df["department"])

# Display sample data
department_dummies.head()

Unnamed: 0,IT,admin,engineering,finance,logistics,marketing,operations,retail,sales,support
0,False,False,False,False,False,False,True,False,False,False
1,False,False,False,False,False,False,True,False,False,False
2,False,False,False,False,False,False,False,False,False,True
3,False,False,False,False,True,False,False,False,False,False
4,False,False,False,False,False,False,False,False,True,False


In [7]:
# Transform the Salary Type column using get_dummies()
salary_dummies = pd.get_dummies(employee_churn_df["salary"])

# Display sample data
salary_dummies.head()

Unnamed: 0,high,low,medium
0,False,True,False
1,False,False,True
2,False,False,True
3,True,False,False
4,True,False,False


In [8]:
# Transform the Salary Type column using get_dummies()
left_dummies = pd.get_dummies(employee_churn_df["left"])

# Display sample data
left_dummies.head()

Unnamed: 0,no,yes
0,True,False
1,True,False
2,True,False
3,True,False
4,True,False


In [9]:
# Replace the original data with the columns of information from the scaled Data
employee_churn_df = pd.concat([department_dummies, salary_dummies, left_dummies, df_churn_transformed], axis=1)

# Review the DataFrame
employee_churn_df

Unnamed: 0,IT,admin,engineering,finance,logistics,marketing,operations,retail,sales,support,...,medium,no,yes,promoted,review,projects,tenure,satisfaction,bonus,avg_hrs_month
0,False,False,False,False,False,False,True,False,False,False,...,False,True,False,-0.176748,-0.870524,-0.474599,-1.099499,0.770211,-0.518771,-0.915767
1,False,False,False,False,False,False,True,False,False,False,...,True,True,False,-0.176748,1.173161,-0.474599,-0.392964,-0.384527,-0.518771,-0.471316
2,False,False,False,False,False,False,False,False,False,True,...,True,True,False,-0.176748,0.829076,-0.474599,-0.392964,-0.364695,-0.518771,-0.059230
3,False,False,False,False,True,False,False,False,False,False,...,False,True,False,-0.176748,0.273521,1.252203,1.020106,-0.406856,-0.518771,0.976200
4,False,False,False,False,False,False,False,False,True,False,...,False,True,False,-0.176748,0.285770,-0.474599,-1.099499,0.460198,1.927633,-1.167898
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9535,False,False,False,False,False,False,True,False,False,False,...,True,False,True,-0.176748,-0.478744,1.252203,1.020106,0.245959,-0.518771,0.843062
9536,False,False,False,False,True,False,False,False,False,False,...,True,False,True,-0.176748,1.114402,-0.474599,1.020106,0.280068,-0.518771,0.847991
9537,False,False,False,False,False,False,True,False,False,False,...,False,False,True,-0.176748,-1.100166,-0.474599,0.313571,1.266378,-0.518771,0.451052
9538,True,False,False,False,False,False,False,False,False,False,...,True,False,True,-0.176748,-0.789903,1.252203,1.020106,0.647395,1.927633,0.718957


In [10]:
# Remove the redundant no column
employee_churn_df = employee_churn_df.drop(columns=["no"])
employee_churn_df.head()

Unnamed: 0,IT,admin,engineering,finance,logistics,marketing,operations,retail,sales,support,...,low,medium,yes,promoted,review,projects,tenure,satisfaction,bonus,avg_hrs_month
0,False,False,False,False,False,False,True,False,False,False,...,True,False,False,-0.176748,-0.870524,-0.474599,-1.099499,0.770211,-0.518771,-0.915767
1,False,False,False,False,False,False,True,False,False,False,...,False,True,False,-0.176748,1.173161,-0.474599,-0.392964,-0.384527,-0.518771,-0.471316
2,False,False,False,False,False,False,False,False,False,True,...,False,True,False,-0.176748,0.829076,-0.474599,-0.392964,-0.364695,-0.518771,-0.05923
3,False,False,False,False,True,False,False,False,False,False,...,False,False,False,-0.176748,0.273521,1.252203,1.020106,-0.406856,-0.518771,0.9762
4,False,False,False,False,False,False,False,False,True,False,...,False,False,False,-0.176748,0.28577,-0.474599,-1.099499,0.460198,1.927633,-1.167898


In [11]:
# Create a a list to store inertia values and the values of k
inertia = []
k = list(range(1, 11))

In [12]:
# Create a for-loop where each value of k is evaluated using the K-means algorithm
# Fit the model using the service_ratings DataFrame
# Append the value of the computed inertia from the `inertia_` attribute of the KMeans model instance
for i in k:
    k_model = KMeans(n_clusters=i, random_state=0)
    k_model.fit(employee_churn_df)
    inertia.append(k_model.inertia_)



In [13]:
# Define a DataFrame to hold the values for k and the corresponding inertia
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

# Review the DataFrame
df_elbow

Unnamed: 0,k,inertia
0,1,81508.271488
1,2,67598.042268
2,3,58480.430495
3,4,51988.799381
4,5,46632.961596
5,6,43549.789796
6,7,41122.726658
7,8,39290.902899
8,9,37696.068307
9,10,36399.25998


In [14]:
# Plot the DataFrame
df_elbow.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k
)

In [15]:
# Define the model with 5 clusters
model = KMeans(n_clusters=5, random_state=3)

# Fit the model
model.fit(employee_churn_df)

# Make predictions
k_5 = model.predict(employee_churn_df)

# Create a copy of the preprocessed data
predictions_df = employee_churn_df.copy()

# Add a class column with the labels
predictions_df['segments'] = k_5

# Add clusters to original dataset
employee_churn_df['segments'] = predictions_df['segments']



In [16]:
# Plot the clusters
employee_churn_df.hvplot.scatter(
    x="tenure",
    y="yes",
    by="segments"
)

In [17]:
# Define the model with 5 clusters
model = KMeans(n_clusters=2, random_state=3)

# Fit the model
model.fit(employee_churn_df)

# Make predictions
k_2 = model.predict(employee_churn_df)

# Create a copy of the preprocessed data
predictions_df = employee_churn_df.copy()

# Add a class column with the labels
predictions_df['segments'] = k_2

# Add clusters to original dataset
employee_churn_df['segments'] = predictions_df['segments']



In [18]:
# Plot the clusters
employee_churn_df.hvplot.scatter(
    x="promoted",
    y="yes",
    by="segments")

In [19]:

### Alternate version using another clustering algorithm : DBScan

In [20]:
from sklearn.cluster import DBSCAN

# Define the DBSCAN model
# Specify the epsilon (eps) parameter and the minimum samples (min_samples) parameter
# Adjust these parameters based on your data and clustering needs
model = DBSCAN(eps=1, min_samples=50)

# Fit the DBSCAN model
model.fit(employee_churn_df)

# Make predictions
dbscan_labels = model.labels_

# Create a copy of the DataFrame and name it as spread_df_predictions
predictions_df = employee_churn_df.copy()

# Add a class column with the labels to the spread_df_predictions DataFrame
predictions_df['DBS_clusters'] = dbscan_labels


In [21]:
# Plot the clusters
predictions_df.hvplot.scatter(
    x="bonus",
    y="yes",
    by="DBS_clusters"
).opts(yformatter="%.0f")

In [22]:
# Import the PCA module
from sklearn.decomposition import PCA

In [23]:
# Instantiate the PCA instance and declare the number of PCA variables
pca=PCA(n_components=2)

In [24]:
# Fit the PCA model on the transformed credit card DataFrame
employees_pca = pca.fit_transform(employee_churn_df)

# Review the first 5 rows of the array of list data
employees_pca[:5]

array([[-1.44765873,  1.16266742],
       [-0.58283905, -1.16987935],
       [-0.25929824, -0.90257572],
       [ 1.50989064, -0.47993114],
       [-1.8105975 ,  0.24247721]])

In [25]:
# Calculate the PCA explained variance ratio
pca.explained_variance_ratio_

array([0.24943315, 0.15716093])

In [26]:
sum(pca.explained_variance_ratio_)

0.4065940785589692

In [27]:
# Create the PCA DataFrame
churn_pca_df = pd.DataFrame(
    employees_pca,
    columns=["PCA1", "PCA2"]
)

# Review the PCA DataFrame
churn_pca_df.head()

Unnamed: 0,PCA1,PCA2
0,-1.447659,1.162667
1,-0.582839,-1.169879
2,-0.259298,-0.902576
3,1.509891,-0.479931
4,-1.810598,0.242477


In [28]:
# Create a a list to store inertia values and the values of k
inertia = []
k = list(range(1, 11))

In [29]:
# Create a for-loop where each value of k is evaluated using the K-means algorithm
# Fit the model using the service_ratings DataFrame
# Append the value of the computed inertia from the `inertia_` attribute of the KMeans model instance
for i in k:
    k_model = KMeans(n_clusters=i, random_state=3)
    k_model.fit(churn_pca_df)
    inertia.append(k_model.inertia_)



In [30]:
# Define a DataFrame to hold the values for k and the corresponding inertia
elbow_data = {"k": k, "inertia": inertia}

# Create the DataFrame from the elbow data
df_elbow = pd.DataFrame(elbow_data)

# Review the DataFrame
df_elbow.head()

Unnamed: 0,k,inertia
0,1,34102.405541
1,2,19167.086671
2,3,12217.448574
3,4,9833.715587
4,5,8131.662729


In [31]:
# Plot the DataFrame
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)

In [32]:
# Define the model Kmeans model using the optimal value of k for the number of clusters.
model = KMeans(n_clusters=3, random_state=3)

# Fit the model
model.fit(churn_pca_df)

# Make predictions
k_3 = model.predict(churn_pca_df)

# Create a copy of the customers_pca_df DataFrame
churn_pca_predictions_df = churn_pca_df.copy()

# Add a class column with the labels
churn_pca_predictions_df["employee_segments"] = k_3



In [33]:
# Plot the clusters
churn_pca_predictions_df.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="employee_segments"
)

In [34]:
# Define the model Kmeans model using k=3 clusters
model = KMeans(n_clusters=3, random_state=3)

# Fit the model
model.fit(employee_churn_df)

# Make predictions
k_3 = model.predict(employee_churn_df)

# Create a copy of the customers_transformed_df DataFrame
churn_transformed_predictions_df = employee_churn_df.copy()

# Add a class column with the labels
churn_transformed_predictions_df["employee_segments"] = k_3



In [35]:
churn_transformed_predictions_df

Unnamed: 0,IT,admin,engineering,finance,logistics,marketing,operations,retail,sales,support,...,yes,promoted,review,projects,tenure,satisfaction,bonus,avg_hrs_month,segments,employee_segments
0,False,False,False,False,False,False,True,False,False,False,...,False,-0.176748,-0.870524,-0.474599,-1.099499,0.770211,-0.518771,-0.915767,0,1
1,False,False,False,False,False,False,True,False,False,False,...,False,-0.176748,1.173161,-0.474599,-0.392964,-0.384527,-0.518771,-0.471316,1,1
2,False,False,False,False,False,False,False,False,False,True,...,False,-0.176748,0.829076,-0.474599,-0.392964,-0.364695,-0.518771,-0.059230,1,1
3,False,False,False,False,True,False,False,False,False,False,...,False,-0.176748,0.273521,1.252203,1.020106,-0.406856,-0.518771,0.976200,1,0
4,False,False,False,False,False,False,False,False,True,False,...,False,-0.176748,0.285770,-0.474599,-1.099499,0.460198,1.927633,-1.167898,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9535,False,False,False,False,False,False,True,False,False,False,...,True,-0.176748,-0.478744,1.252203,1.020106,0.245959,-0.518771,0.843062,1,0
9536,False,False,False,False,True,False,False,False,False,False,...,True,-0.176748,1.114402,-0.474599,1.020106,0.280068,-0.518771,0.847991,1,0
9537,False,False,False,False,False,False,True,False,False,False,...,True,-0.176748,-1.100166,-0.474599,0.313571,1.266378,-0.518771,0.451052,1,0
9538,True,False,False,False,False,False,False,False,False,False,...,True,-0.176748,-0.789903,1.252203,1.020106,0.647395,1.927633,0.718957,1,0


In [36]:
# Plot the clusters using the first two feature columns
churn_transformed_predictions_df.hvplot.scatter(
    x="avg_hrs_month",
    y="satisfaction",
    by="employee_segments"
)