In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from imblearn.pipeline import Pipeline as im_pipe
from sklearn.pipeline import Pipeline as sk_pipe
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, RepeatedStratifiedKFold, cross_validate
from sklearn.base import clone
from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score, confusion_matrix, roc_auc_score, roc_curve, plot_roc_curve
from sklearn.metrics import auc, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from scipy import interp
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the HR attrition dataset into `employee_df` and preview the first rows.
# NOTE(review): the path is hard-coded under the author's home directory, so this cell only runs
# on a machine with the same folder layout — consider a configurable data directory.
employee_df = pd.read_csv('~/Documents/Data Science/2nd Semester/Data Science Bootcamp/Project datasets-20220912/DATASET_-_NOVA_IMS_-_Human-Resources-Employee-Attrition.docx/HR_DS.csv')
employee_df.head()

In [None]:
employee_df.shape

We seem to have little data to work with, so we probably cannot afford to split it into separate TRAIN and VALIDATION sets. We will use K-Fold cross-validation instead.

In [None]:
employee_df.columns.tolist()

In [None]:
employee_df.info()

In [None]:
employee_df.Over18.value_counts()

In [None]:
employee_df.isna().sum()

There are no missing values in our dataset.

In [None]:
employee_df[employee_df.duplicated(keep = False)]

No duplicates found!

### Target Distribution

In [None]:
sns.histplot(data = employee_df.Attrition, legend= True)

We have an imbalanced dataset, since most of our observations have a target of 'No'. We must take this into account when choosing evaluation metrics.

### Numerical Variables

In [None]:
employee_df.describe().T

EmployeeCount and StandardHours are constant features (a single value for every employee) and should be dropped, as they don't provide relevant information for the problem.

EmployeeNumber doesn't seem relevant as a feature either, so we set it as the index.

In [None]:
employee_df.set_index('EmployeeNumber', inplace= True)
employee_df.head()

In [None]:
# Remove the columns that are not used as features; rebind the name instead of mutating in place.
uninformative_columns = ['EmployeeCount', 'StandardHours', 'Over18']
employee_df = employee_df.drop(columns= uninformative_columns)

In [None]:
# Skewness is only defined for numeric columns. The frame still holds string columns at this point,
# and pandas >= 2.0 raises on them instead of silently skipping them, so restrict explicitly.
employee_df.skew(numeric_only= True)

NumCompaniesWorked, PerformanceRating, TotalWorkingYears, YearsAtCompany, YearsSinceLastPromotion seem to be highly skewed and might contain a few outliers.

In [None]:
sns.histplot(employee_df.NumCompaniesWorked, color = 'g')

In [None]:
sns.histplot(employee_df.PerformanceRating, color = 'r')
print(employee_df.PerformanceRating.value_counts())

Performance Rating looks odd: it only takes the values 3 and 4?

In [None]:
sns.histplot(employee_df.TotalWorkingYears, color = 'b')

In [None]:
sns.histplot(employee_df.YearsAtCompany, color = 'purple')

In [None]:
# 2x2 grid of boxplots, one panel per feature, each drawn in its own colour.
figure, axes = plt.subplots(nrows= 2, ncols= 2, sharex= False, figsize= (10, 5))
figure.suptitle('Boxplots')

boxplot_specs = [
    ('NumCompaniesWorked', 'g'),
    ('PerformanceRating', 'r'),
    ('TotalWorkingYears', 'b'),
    ('YearsAtCompany', 'purple'),
]
# axes.flat walks the grid row by row: (0, 0), (0, 1), (1, 0), (1, 1).
for panel, (feature, shade) in zip(axes.flat, boxplot_specs):
    sns.boxplot(ax= panel, data= employee_df, y= feature, color= shade)

In [None]:
# Pearson correlation matrix of the numeric features, shown as an annotated heatmap.
# Numeric columns are selected explicitly: the frame still contains string columns here, and
# pandas >= 2.0 raises on them instead of silently dropping them.
df_corr = employee_df.select_dtypes(include= 'number').corr(method = 'pearson')
figure = plt.figure(figsize = (16, 10))
sns.heatmap(df_corr, cmap = 'YlGn',annot = True, fmt = '.1g')

In [None]:
employee_df.MonthlyIncome.hist()

### Categorical Variables

In [None]:
employee_df.describe(include = ['O']) # include the object-dtype (categorical) variables in describe

No NaN values found in categorical variables.

As seen previously, Attrition takes the value "No" most of the time. Employees tend to travel rarely, work in R&D, come from a Life Sciences background, and are mostly male.

The most frequent Job Role is Sales Executive, more than half of employees are Married and more than 2/3 don't do overtime.

# Data Pre Processing

### Dummify Target Variable

In [None]:
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0.
# The result is assigned back to the frame. Calling `replace(..., inplace=True)` on the Series
# returned by attribute access mutates an intermediate object, which pandas' copy-on-write mode
# does not propagate to `employee_df`.
attrition_codes = {'Yes': 1, 'No': 0}
# Values that are already encoded pass through unchanged, so the cell can be re-run safely;
# astype(int) fails loudly if an unexpected label is present.
employee_df['Attrition'] = (
    employee_df['Attrition']
    .map(lambda value: attrition_codes.get(value, value))
    .astype(int)
)
employee_df.Attrition.value_counts()

### Separate Labels from Features

In [None]:
labels = employee_df[['Attrition']].copy()

In [None]:
# Share of observations whose target equals 1 (mean of the boolean mask).
labels.Attrition.eq(1).sum() / len(labels)

In [None]:
employee_df = employee_df.drop('Attrition', axis = 1)
employee_df.head()

### Separate Categorical variables from Numerical

In [None]:
employee_df.dtypes

In [None]:
# Split the features by type: the columns listed here are treated as categorical,
# everything else in the frame as numerical.
categorical_columns = ['BusinessTravel', 'Department', 'EducationField', 'Gender',
                       'JobRole', 'MaritalStatus', 'OverTime']
df_cat = employee_df[categorical_columns]
df_num = employee_df.drop(columns= categorical_columns)

In [None]:
list_num = df_num.columns.tolist()
list_cat = df_cat.columns.tolist()

In [None]:
# Per-type preprocessing pipelines: one-hot encode the categorical columns, standardise the numerical ones.
# (`cat_pipepline` is misspelled, but the name is referenced by later cells, so it is left unchanged.)
cat_pipepline = sk_pipe([('encoder', OneHotEncoder(categories= 'auto'))])
num_pipeline = sk_pipe([('std_scaler', StandardScaler())])

In [None]:
employee_cat = cat_pipepline.fit_transform(df_cat) # apply the pipeline to cat data
employee_num = num_pipeline.fit_transform(df_num) # apply the pipeline to num data

In [None]:
# Single transformer that sends the columns in `list_num` through `num_pipeline` and the
# columns in `list_cat` through `cat_pipepline`.
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, list_num),
    ('cat', cat_pipepline, list_cat),
])

In [None]:
employee_array_prepared = full_pipeline.fit_transform(employee_df)

In [None]:
employee_array_prepared

In [None]:
employee_df

#### Transform prepared array back to dataframe

In [None]:
# Build the names of the one-hot columns in exactly the order the fitted encoder emits them.
# OneHotEncoder lays out each feature's columns following `categories_` (sorted values), whereas
# `Series.unique()` returns values in order of first appearance. Naming the columns from `unique()`
# attaches the wrong category label to the encoded columns.
fitted_encoder = cat_pipepline.named_steps['encoder']
cat_cols_encoded = []
for col, categories in zip(list_cat, fitted_encoder.categories_):
    # Prefix each category with the first letter of its source column (original naming scheme).
    cat_cols_encoded += [f"{col[0]}_{cat}" for cat in categories]

cat_cols_encoded

In [None]:
len(employee_cat.toarray()[0])

In [None]:
employee_num

In [None]:
employee_df_prepared_cat = pd.DataFrame(employee_cat.toarray(), columns= cat_cols_encoded, 
                                      index= df_cat.index) ## turn the encoded categorical features into a DataFrame
employee_df_prepared_num = pd.DataFrame(employee_num, columns= df_num.columns,
                                       index= df_num.index) ## turn the scaled numerical features into a DataFrame

# Both frames carry the same index, so a plain index join puts them side by side. This replaces
# merging on an index array, which created a synthetic `key_0` column that had to be renamed and
# set back as the index.
employee_df_prepared = (
    employee_df_prepared_num
    .join(employee_df_prepared_cat, how= 'left')
    .rename_axis('EmployeeNumber')
)
employee_df_prepared

## Select and Train a Model

### Decision Tree Classifier

### Without SMOTE

In [None]:
# Baseline decision tree fitted on the full prepared data.
# random_state makes the tree reproducible across runs; the target is passed as a 1-D array,
# which is the shape scikit-learn expects for y (a single-column DataFrame triggers a conversion warning).
dec_tree_clf = DecisionTreeClassifier(random_state= 1)
dec_tree_clf.fit(employee_df_prepared, labels.values.ravel())

In [None]:
# Cross-validate the plain decision tree with cv= 3, scoring ROC AUC and weighted F1.
# NOTE(review): the SMOTE variant below is evaluated with RepeatedStratifiedKFold (10 splits x 3 repeats),
# so the two sets of scores come from different protocols and are not directly comparable.
scores = cross_validate(dec_tree_clf, employee_df_prepared, labels, scoring= ('roc_auc', 'f1_weighted'), cv= 3, n_jobs= -1)

In [None]:
print('Mean ROC AUC: %.3f' % np.mean((scores['test_roc_auc'])))
print('Mean F1-Score Weighted: %.3f' % np.mean((scores['test_f1_weighted'])))

### With SMOTE

In [None]:
# Decision tree with SMOTE oversampling as a pipeline step, evaluated with repeated stratified CV.
# SMOTE and the tree are seeded so the scores are reproducible from run to run.
steps = [('over', SMOTE(random_state= 1)), ('model', DecisionTreeClassifier(random_state= 1))]
pipeline = im_pipe(steps= steps)
cv = RepeatedStratifiedKFold(n_splits= 10, n_repeats= 3, random_state= 1)
scores = cross_validate(pipeline, employee_df_prepared, labels, scoring= ('roc_auc', 'f1_weighted'), cv=cv, n_jobs= -1)

In [None]:
print('Mean ROC AUC: %.3f' % np.mean((scores['test_roc_auc'])))
print('Mean F1-Score Weighted: %.3f' % np.mean((scores['test_f1_weighted'])))

## Random Forest

### Without SMOTE

In [None]:
# Random forest with per-bootstrap class re-weighting, fitted on the full prepared data.
# Seeded for reproducibility; y is passed as a 1-D array as scikit-learn expects.
rf_clf = RandomForestClassifier(n_estimators= 10, class_weight= 'balanced_subsample', random_state= 1)
rf_clf.fit(employee_df_prepared, labels.values.ravel())

In [None]:
# Define an evaluation method. Seeded like every other CV splitter in this notebook so the
# folds (and therefore the scores) are reproducible.
cv = RepeatedStratifiedKFold(n_splits= 10, n_repeats= 3, random_state= 1)

In [None]:
scores = cross_validate(rf_clf, employee_df_prepared, labels, cv= cv, scoring= ('roc_auc', 'f1_weighted'))

In [None]:
print('Mean ROC AUC: %.3f' % np.mean((scores['test_roc_auc'])))
print('Mean F1-Score Weighted: %.3f' % np.mean((scores['test_f1_weighted'])))

### With SMOTE

In [None]:
# Random forest with SMOTE oversampling as a pipeline step, evaluated with repeated stratified CV.
# SMOTE and the forest are seeded so the scores are reproducible.
steps = [('over', SMOTE(random_state= 1)),
         ('model', RandomForestClassifier(n_estimators= 10, class_weight= 'balanced_subsample', random_state= 1))]
pipeline = im_pipe(steps=steps)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_validate(pipeline, employee_df_prepared, labels, scoring= ('roc_auc', 'f1_weighted'), cv= cv, n_jobs=-1)

In [None]:
print('Mean ROC AUC: %.3f' % np.mean((scores['test_roc_auc'])))
print('Mean F1-Score Weighted: %.3f' % np.mean((scores['test_f1_weighted'])))

#### Feature Importance - Random Forest

In [None]:
# Fit two forests that differ only in split criterion and keep their feature importances.
# Both use the same seed, so differences between the importance vectors come from the criterion
# and not from random variation between runs. y is passed as a 1-D array.
target_vector = labels.values.ravel()
gini_importance_rf = RandomForestClassifier(n_estimators = 10, class_weight= 'balanced_subsample', 
                                            criterion = 'gini', random_state= 1)
gini_importance_fit_rf = gini_importance_rf.fit(employee_df_prepared, target_vector).feature_importances_
entropy_importance_rf = RandomForestClassifier(n_estimators = 10, class_weight= 'balanced_subsample',
                                               criterion = 'entropy', random_state= 1)
entropy_importance_fit_rf = entropy_importance_rf.fit(employee_df_prepared, target_vector).feature_importances_

In [None]:
# Put both importance vectors next to the feature names, reshape to long format
# (one row per feature/criterion pair) and plot them sorted by importance.
zippy = pd.DataFrame({'gini': gini_importance_fit_rf, 'entropy': entropy_importance_fit_rf})
zippy['col'] = employee_df_prepared.columns
tidy = (
    zippy
    .melt(id_vars='col')
    .rename(columns=str.title)  # 'col'/'variable'/'value' -> 'Col'/'Variable'/'Value'
    .sort_values(['Value'], ascending = False)
)

plt.figure(figsize=(30,15))
sns.barplot(y='Col', x='Value', hue='Variable', data=tidy)

## XGBoost

### Without SMOTE

In [None]:
# XGBoost without resampling; class imbalance is handled through `scale_pos_weight`.
# XGBoost's recommended value is sum(negative instances) / sum(positive instances), which
# up-weights the rarer positive class. The previous value of 0.1 did the opposite and
# down-weighted the positive class.
n_positive = int((labels.Attrition == 1).sum())
n_negative = int((labels.Attrition == 0).sum())
steps = [('model', XGBClassifier(scale_pos_weight= n_negative / n_positive))]
pipeline = im_pipe(steps= steps)
cv = RepeatedStratifiedKFold(n_splits= 10, n_repeats= 3, random_state= 1)
scores = cross_validate(pipeline, employee_df_prepared, labels, scoring= ('roc_auc', 'f1_weighted'), cv= cv, n_jobs= -1)

In [None]:
print('Mean ROC AUC: %.3f' % np.mean((scores['test_roc_auc'])))
print('Mean F1-Score Weighted: %.3f' % np.mean((scores['test_f1_weighted'])))

### With SMOTE

In [None]:
# XGBoost with SMOTE oversampling. SMOTE already balances the classes in the training folds,
# so positives must not be re-weighted on top of it: scale_pos_weight is set to 1 (neutral).
# The previous value of 0.1 down-weighted the positive class after it had been oversampled.
# SMOTE is seeded for reproducibility.
steps = [('over', SMOTE(random_state= 1)), ('model', XGBClassifier(scale_pos_weight= 1))]
pipeline = im_pipe(steps= steps)
cv = RepeatedStratifiedKFold(n_splits= 10, n_repeats= 3, random_state= 1)
scores = cross_validate(pipeline, employee_df_prepared, labels, scoring= ('roc_auc', 'f1_weighted'), cv=cv, n_jobs= -1)

In [None]:
print('Mean ROC AUC: %.3f' % np.mean((scores['test_roc_auc'])))
print('Mean F1-Score Weighted: %.3f' % np.mean((scores['test_f1_weighted'])))

## AdaBoost

### Without SMOTE

In [None]:
# AdaBoost without resampling, evaluated with repeated stratified CV.
# The classifier is seeded so the scores are reproducible.
steps = [('model', AdaBoostClassifier(random_state= 1))]
pipeline = im_pipe(steps= steps)
cv = RepeatedStratifiedKFold(n_splits= 10, n_repeats= 3, random_state= 1)
scores = cross_validate(pipeline, employee_df_prepared, labels, scoring= ('roc_auc', 'f1_weighted'), cv= cv, n_jobs= -1)

In [None]:
print('Mean ROC AUC: %.3f' % np.mean((scores['test_roc_auc'])))
print('Mean F1-Score Weighted: %.3f' % np.mean((scores['test_f1_weighted'])))

### With SMOTE

In [None]:
# AdaBoost with SMOTE oversampling, evaluated with repeated stratified CV.
# Both steps are seeded: this pipeline is later refitted to read feature importances, and
# unseeded runs would change which features end up with zero importance.
steps = [('over', SMOTE(random_state= 1)), ('model', AdaBoostClassifier(random_state= 1))]
pipeline = im_pipe(steps= steps)
cv = RepeatedStratifiedKFold(n_splits= 10, n_repeats= 3, random_state= 1)
scores = cross_validate(pipeline, employee_df_prepared, labels, scoring= ('roc_auc', 'f1_weighted'), cv= cv, n_jobs= -1)

In [None]:
print('Mean ROC AUC: %.3f' % np.mean((scores['test_roc_auc'])))
print('Mean F1-Score Weighted: %.3f' % np.mean((scores['test_f1_weighted'])))

## Model Fit and Feature Importance

In [None]:
pipeline.fit(employee_df_prepared, labels)

In [None]:
pipeline[1].feature_importances_

In [None]:
employee_df_prepared.columns

In [None]:
pd.set_option('display.max_columns', 500)
feat_imp = pd.DataFrame({'Features': employee_df_prepared.columns, 'Importance': pipeline[1].feature_importances_})
feat_imp.T

In [None]:
columns_to_drop = feat_imp[feat_imp.Importance == 0].Features.tolist()

In [None]:
# Feature set without the columns listed in `columns_to_drop`; `drop` returns a new frame,
# so `employee_df_prepared` itself is left untouched.
employee_df_prepared_feature_importance = employee_df_prepared.drop(columns= columns_to_drop)

In [None]:
# Re-evaluate SMOTE + AdaBoost on the reduced feature set.
# Both steps are seeded so the scores, and the importances read from the refitted pipeline, are reproducible.
steps = [('over', SMOTE(random_state= 1)), ('model', AdaBoostClassifier(random_state= 1))]
pipeline = im_pipe(steps= steps)
cv = RepeatedStratifiedKFold(n_splits= 10, n_repeats= 3, random_state= 1)
scores = cross_validate(pipeline, employee_df_prepared_feature_importance, labels, scoring= ('roc_auc', 'f1_weighted'), cv= cv, n_jobs= -1)

In [None]:
print('Mean ROC AUC: %.3f' % np.mean((scores['test_roc_auc'])))
print('Mean F1-Score Weighted: %.3f' % np.mean((scores['test_f1_weighted'])))

In [None]:
pipeline.fit(employee_df_prepared_feature_importance, labels)

In [None]:
pipeline[1].feature_importances_

In [None]:
feat_imp_v2 = pd.DataFrame({'Features': employee_df_prepared_feature_importance.columns, 'Importance': pipeline[1].feature_importances_})
feat_imp_v2.T

In [None]:
columns_to_drop = feat_imp_v2[feat_imp_v2.Importance == 0].Features.tolist()

In [None]:
# Second pruning pass: remove the columns listed in `columns_to_drop` from the already reduced
# feature set. `drop` returns a new frame, so the previous version is left untouched.
employee_df_prepared_feature_importance_v2 = employee_df_prepared_feature_importance.drop(columns= columns_to_drop)

In [None]:
# Re-evaluate SMOTE + AdaBoost on the twice-reduced feature set.
# Both steps are seeded so the scores, and the importances read from the refitted pipeline, are reproducible.
steps = [('over', SMOTE(random_state= 1)), ('model', AdaBoostClassifier(random_state= 1))]
pipeline = im_pipe(steps= steps)
cv = RepeatedStratifiedKFold(n_splits= 10, n_repeats= 3, random_state= 1)
scores = cross_validate(pipeline, employee_df_prepared_feature_importance_v2, labels, scoring= ('roc_auc', 'f1_weighted'), cv= cv, n_jobs= -1)

In [None]:
print('Mean ROC AUC: %.3f' % np.mean((scores['test_roc_auc'])))
print('Mean F1-Score Weighted: %.3f' % np.mean((scores['test_f1_weighted'])))

In [None]:
pipeline.fit(employee_df_prepared_feature_importance_v2, labels)

In [None]:
pipeline[1].feature_importances_

In [None]:
feat_imp_v3 = pd.DataFrame({'Features': employee_df_prepared_feature_importance_v2.columns, 'Importance': pipeline[1].feature_importances_})
feat_imp_v3.T

### Hyperparameter Tuning

In [None]:
# SMOTE + AdaBoost with hand-set hyperparameters (learning_rate, n_estimators), cross-validated
# and then fitted on the full reduced feature set. Later cells read `pipeline` from here.
# Both steps are seeded for reproducibility; y is passed to `fit` as a 1-D array.
# NOTE(review): algorithm= 'SAMME.R' has been deprecated in recent scikit-learn releases —
# confirm it is still accepted by the installed version.
steps = [('over', SMOTE(random_state= 1)), ('model', AdaBoostClassifier(learning_rate= 0.7, n_estimators= 100,
                                                        algorithm= 'SAMME.R', random_state= 1))]
pipeline = im_pipe(steps= steps)
cv = RepeatedStratifiedKFold(n_splits= 10, n_repeats= 3, random_state= 1)
scores = cross_validate(pipeline, employee_df_prepared_feature_importance_v2, labels, scoring= ('roc_auc', 'f1_weighted'), cv= cv, n_jobs= -1)
pipeline.fit(employee_df_prepared_feature_importance_v2, labels.values.ravel())

In [None]:
pipeline

In [None]:
print('Mean ROC AUC: %.3f' % np.mean((scores['test_roc_auc'])))
print('Mean F1-Score Weighted: %.3f' % np.mean((scores['test_f1_weighted'])))

### Feature Correlation W/ Target

In [None]:
# Pearson correlation of every retained feature with the target, shown next to the importance
# the fitted model assigns to that feature.
columns = employee_df_prepared_feature_importance_v2.columns.tolist()
# One vectorised call replaces the per-column loop. It returns one value per feature column, in
# column order, with rows aligned on the shared index. It also avoids positional `[0]` access on
# a label-indexed Series, which pandas has deprecated.
correlations = employee_df_prepared_feature_importance_v2.corrwith(labels['Attrition']).tolist()

correlations_df = pd.DataFrame({'Feature': columns, 'Correlation W/ Target': correlations, 
                                'Feature Importance': pipeline[1].feature_importances_})
correlations_df

### ROC Curve

In [None]:
x = employee_df_prepared_feature_importance_v2.copy()
y = labels.copy()
x.reset_index(drop= True, inplace= True)
y.reset_index(drop= True, inplace= True)

In [None]:
# Cross-validated ROC curves: one faint curve per fold plus the mean curve over all folds.
tprs = []
aucs = []
mean_fpr = np.linspace(0,1,100)  # common FPR grid on which the per-fold curves are averaged
cv = RepeatedStratifiedKFold(n_splits= 10, n_repeats= 3, random_state= 1)
plt.figure(figsize=(10 , 7))

for train, test in cv.split(x, y):
    # y is a single-column frame; estimators and metrics expect a 1-D target.
    y_train = y.iloc[train].values.ravel()
    y_test = y.iloc[test].values.ravel()
    prediction = pipeline.fit(x.iloc[train], y_train).predict_proba(x.iloc[test])
    fpr, tpr, t = roc_curve(y_test, prediction[:, 1])
    # np.interp replaces scipy.interp, which was only an alias for it and has been removed from SciPy.
    fold_tpr = np.interp(mean_fpr, fpr, tpr)
    fold_tpr[0] = 0.0  # every ROC curve starts at (0, 0)
    tprs.append(fold_tpr)
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=2, alpha=0.3)

plt.plot([0,1],[0,1],linestyle = '--',lw = 2,color = 'black')  # chance line
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # every ROC curve ends at (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='navy',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),lw=4, alpha=1)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate', fontsize= 20)
plt.ylabel('True Positive Rate', fontsize= 20)
plt.title('ROC Curve', fontsize= 20)
plt.legend(loc= "lower right")
plt.show()

## Precision Recall Curve

In [None]:
# Cross-validated precision-recall curves: one faint curve per fold plus the mean curve.
# defining the lists
prs = []   # per-fold precision, interpolated onto the common recall grid
aucs = []  # per-fold area under the PR curve
mean_recall = np.linspace(0, 1, 100)
cv = RepeatedStratifiedKFold(n_splits= 10, n_repeats= 3, random_state= 1)

plt.figure(figsize=(10 , 7))
for train, test in cv.split(x, y):
    # y is a single-column frame; estimators and metrics expect a 1-D target.
    y_train = y.iloc[train].values.ravel()
    y_test = y.iloc[test].values.ravel()
    probas_ = pipeline.fit(x.iloc[train], y_train).predict_proba(x.iloc[test])
    # Compute PR curve and area under the curve
    precision, recall, thresholds = precision_recall_curve(y_test, probas_[:, 1])
    # Interpolate precision as a function of recall. precision_recall_curve returns recall in
    # decreasing order while np.interp needs increasing x values, so both arrays are reversed.
    # (The previous code passed precision as the x coordinates, which are not monotonic.)
    prs.append(np.interp(mean_recall, recall[::-1], precision[::-1]))
    pr_auc = auc(recall, precision)
    aucs.append(pr_auc)
    plt.plot(recall, precision, lw=3, alpha=0.5)

# No-skill baseline for a PR curve: constant precision equal to the share of positives.
positive_rate = float(y.values.ravel().mean())
plt.plot([0, 1], [positive_rate, positive_rate], linestyle='--', lw=2, color='black')
mean_precision = np.mean(prs, axis=0)
mean_auc = auc(mean_recall, mean_precision)
std_auc = np.std(aucs)
# x = recall, y = precision, matching the axis labels below.
plt.plot(mean_recall, mean_precision, color='navy',
         label=r'Mean (AUCPR = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=4, alpha= 1)

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('Recall', fontsize= 20)
plt.ylabel('Precision', fontsize= 20)
plt.title('PR Curve', fontsize= 20)
plt.legend(loc= "lower right")

plt.show()

## Cluster Analysis

### Create a dataframe with only employees that have quit

In [None]:
# Re-attach the target to the feature frame (both are indexed by employee) and keep the
# employees with Attrition == 1.
# Any existing 'Attrition' column is dropped first so the cell can be re-run: merging again
# would otherwise produce duplicated 'Attrition_x' / 'Attrition_y' columns.
employee_df = employee_df.drop(columns= 'Attrition', errors= 'ignore').join(labels)
quit_df = employee_df[employee_df.Attrition == 1]
quit_df.head()

In [None]:
sns.histplot(quit_df.Age, color = 'orange')

We see that a big portion of people that have quit their job are under 35 years old.

In [None]:
df_subset = employee_df[['Age', 'DailyRate']].copy()
sns.pairplot(df_subset)

In [None]:
df_subset = quit_df[['Age', 'DailyRate']].copy()
sns.pairplot(df_subset)

In [None]:
quit_df['DailyRate'].mean()

### From the ones that quit, nearly half of them

In [None]:
not_quit_df = employee_df[employee_df.Attrition == 0]
not_quit_df

In [None]:
# Pearson correlation matrix, now including the numeric Attrition target, as an annotated heatmap.
# Numeric columns are selected explicitly: the frame still holds the string-valued categorical
# columns, and pandas >= 2.0 raises on them instead of silently dropping them.
df_corr = employee_df.select_dtypes(include= 'number').corr(method = 'pearson')
figure = plt.figure(figsize = (16, 10))
sns.heatmap(df_corr, cmap = 'YlGn',annot = True, fmt = '.1g')

Choosing variables for clustering: from the correlation map, although no variable is highly correlated with the target variable, we choose to analyse clusters on the following features:

- Age
- Education
- Monthly income
- Total working years
- Work-life balance

### Quitting Cluster

In [None]:
clus_quit_df = quit_df[['Age', 'Education', 'MonthlyIncome', 'TotalWorkingYears', 'WorkLifeBalance']].copy()
clus_quit_df

In [None]:
scaled_clus_quit_df = MinMaxScaler().fit_transform(clus_quit_df)
scaled_clus_quit_df

In [None]:
cols = clus_quit_df.columns
#index = df_v2.index
# Wrap the scaled values back into a DataFrame carrying the original column names.
# No index is passed, so the rows do not carry the index of `clus_quit_df`; cluster labels are
# matched back to `clus_quit_df` by row position.
scaled_clus_quit_df = pd.DataFrame(scaled_clus_quit_df, columns = cols)
scaled_clus_quit_df

In [None]:
ks = range(1, 11) # candidate numbers of clusters: 1 to 10
inertias = [] # also called sum of squared errors (difference between the points of the data and the centroid)

for k in ks:
    # Create a KMeans instance with k clusters: model
    # Seeded with the same random_state as the final k=4 model, so the elbow curve is
    # reproducible instead of changing with every run's random initialisation.
    model = KMeans(n_clusters=k, random_state= 0)

    # Fit model to samples
    model.fit(scaled_clus_quit_df)

    # Append the inertia to the list of inertias
    inertias.append(model.inertia_)

In [None]:
# Elbow plot: inertia as a function of the number of clusters, with one tick per tested k.
elbow_fig, elbow_ax = plt.subplots()
elbow_ax.plot(ks, inertias)
elbow_ax.set_xlabel('number of clusters')
elbow_ax.set_ylabel('inertia')
elbow_ax.set_xticks(ks)
plt.show()

In [None]:
# Final clustering of the employees who quit: k = 4, seeded for reproducibility.
model_k4 = KMeans(n_clusters = 4, random_state = 0)
scaled_clus_quit_df_k4 = scaled_clus_quit_df.copy() # fit on a copy of the scaled data; the labels are attached to the unscaled frame afterwards
model_k4.fit(scaled_clus_quit_df_k4)

In [None]:
clus_quit_df['label'] = model_k4.labels_
clus_quit_df

In [None]:
clus_quit_df.groupby(['label']).mean().transpose()

In [None]:
clus_quit_df['label'].value_counts()

There are four different types of people quitting.

1 - Older employees with really high salaries - probably retired

2 - Lower education, fewer working years, good work-life balance

3 - High education, worst work-life balance

4 - Higher education, lower salaries - focus on this group in the next steps


### Not Quitting Cluster

In [None]:
clus_not_quit_df = not_quit_df[['Age', 'Education', 'MonthlyIncome', 'TotalWorkingYears', 'WorkLifeBalance']].copy()
clus_not_quit_df

In [None]:
scaled_clus_not_quit_df = MinMaxScaler().fit_transform(clus_not_quit_df)
scaled_clus_not_quit_df

In [None]:
cols = clus_not_quit_df.columns
#index = df_v2.index
# Wrap the scaled values back into a DataFrame carrying the original column names.
# No index is passed, so the rows do not carry the index of `clus_not_quit_df`; cluster labels
# are matched back to `clus_not_quit_df` by row position.
scaled_clus_not_quit_df = pd.DataFrame(scaled_clus_not_quit_df, columns = cols)
scaled_clus_not_quit_df

In [None]:
ks = range(1, 11) # candidate numbers of clusters: 1 to 10
inertias = [] # also called sum of squared errors (difference between the points of the data and the centroid)

for k in ks:
    # Create a KMeans instance with k clusters: model
    # Seeded so the elbow curve is reproducible instead of changing with every run's
    # random initialisation.
    model = KMeans(n_clusters=k, random_state= 0)

    # Fit model to samples
    model.fit(scaled_clus_not_quit_df)

    # Append the inertia to the list of inertias
    inertias.append(model.inertia_)

In [None]:
# Elbow plot: inertia as a function of the number of clusters, with one tick per tested k.
elbow_fig, elbow_ax = plt.subplots()
elbow_ax.plot(ks, inertias)
elbow_ax.set_xlabel('number of clusters')
elbow_ax.set_ylabel('inertia')
elbow_ax.set_xticks(ks)
plt.show()

In [None]:
scaled_clus_not_quit_df_k4 = scaled_clus_not_quit_df.copy() # copy the prod dataset in order to join the label column to the original one
model_k4.fit(scaled_clus_not_quit_df_k4)

In [None]:
clus_not_quit_df['label'] = model_k4.labels_
clus_not_quit_df

In [None]:
clus_not_quit_df.groupby(['label']).mean().transpose()

In [None]:
clus_not_quit_df['label'].value_counts()