Expected outputs are:
*   Logistic Regression and Random Forest Model
*   ROC and AUC for both models
*   treeinterpreter graphs
*   K-means clustering graphs for interpretation



In [None]:
# --- Notebook configuration ---

# Shared random seed (forest, CV splits, permutation importance, K-means).
seed = 47

# Display names interpolated into plot titles.
label = "Attrition"
indiv = "Employee"

# Plot sizing.
max_features_to_show = 10  # max feats to show in overall view
max_cont_graph = 10  # maximum feats to show in feat contribution graphs

# Search / clustering budgets.
iters = 40  # iterations for randomized search cv
min_clus = 5
max_clus = 8

# Input tables. NOTE: train and test currently point at the same file.
train_path = '../../data/df-june.csv'
test_path = '../../data/df-june.csv'

# Columns used as K-means clustering features.
clus_cols = [
    'Tenure in Yrs',
    'Time in Grade (Yrs)',
    'Time in Position',
    'Age in Years',
    'Average Audio Calls',
    'Average Emails Sent',
    'Conglo Compa-ratio',
]

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import cross_val_score, RandomizedSearchCV, StratifiedKFold

from treeinterpreter import treeinterpreter as ti

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.decomposition import PCA

# Data Cleaning

In [None]:
# Load the train and test tables; the first CSV column becomes the row index.
df, df_test = (
    pd.read_csv(csv_path, index_col=0) for csv_path in (train_path, test_path)
)

In [None]:
# The two status columns encode the outcome, so both are removed from the
# feature matrices; 'Status_Withdrawn' is the prediction target.
status_cols = ['Status_Withdrawn', 'Status_Active']

y = df['Status_Withdrawn']
X = df.drop(status_cols, axis=1)
# Fix: the test features must come from df_test (they were previously built
# from the training frame `df`).
X_test = df_test.drop(status_cols, axis=1)
feature_names = X.columns

In [None]:
# Show how many rows fall into each value of the target (class balance check).
y.value_counts()

In [None]:
# Row labels of the training features; reused below to align y_test and the
# rows passed to predict_proba.
indices = X.index

In [None]:
# Test labels selected by the training row labels, so y_test lines up with X.
# Every label in `indices` must therefore also exist in df_test.
y_test = df_test['Status_Withdrawn'][indices]

In [None]:
# Min-max scale the features; the scaled copy is only used for clustering
# and PCA further down (the forest is fitted on the unscaled X).
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)


# Build Random Forest

In [None]:
# Base estimator handed to the randomized hyper-parameter search below.
forest = RandomForestClassifier(
    n_estimators=500,
    class_weight="balanced",
    random_state=seed,
)

In [None]:
# Candidate values sampled by the randomized search.
# Raise `iters` (n_iter) to find better regularization parameters.
param_grid = {
    'max_depth': range(3, 10),
    'criterion': ['gini', 'entropy'],
    'max_features': range(2, 8),
}

# 5-fold CV, candidates ranked by ROC AUC.
gs = RandomizedSearchCV(
    estimator=forest,
    param_distributions=param_grid,
    n_iter=iters,
    scoring="roc_auc",
    cv=5,
    random_state=seed,
)

In [None]:
# Run the randomized search on the unscaled features X and target y.
grid_result = gs.fit(X, y)

In [None]:
# Parameter combination that scored best in the search (dict keyed by the
# names used in param_grid).
best_params = gs.best_params_

In [None]:
# Rebuild the forest with the winning search parameters and fit it on the
# full training data.
forest = RandomForestClassifier(
    n_estimators=500,
    criterion=best_params["criterion"],
    max_depth=best_params["max_depth"],
    max_features=best_params["max_features"],
    class_weight="balanced",
    random_state=seed,
    verbose=False,
)
forest.fit(X, y)

In [None]:
# 10-fold stratified CV, shuffled with the shared seed.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

# First pass uses the estimator's default scorer (scoring=None), second pass
# uses ROC AUC; the mean fold score is printed for each.
for metric in (None, "roc_auc"):
    scores = cross_val_score(forest, X, y, cv=skf, scoring=metric)
    print(np.mean(scores))


In [None]:
# Class probabilities for the rows in `indices`; column 1 is the score used
# for the ROC curve.
# NOTE(review): X is the data the forest was fitted on, so this is an
# in-sample curve -- confirm that is intended.
y_score = forest.predict_proba(X.loc[indices])

fpr, tpr, _ = roc_curve(y_test.loc[indices], y_score[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
ax.plot(
    fpr,
    tpr,
    color="darkorange",
    lw=lw,
    label="ROC curve (area = %0.2f)" % roc_auc,
)
# Diagonal reference line (chance level).
ax.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Receiver operating characteristic Random Forest")
ax.legend(loc="lower right")
plt.show()
plt.close('all')

# Visualize Model Attributes

In [None]:
# Permutation importance of each feature, evaluated on the same X, y the
# forest was fitted on; each feature is shuffled 10 times, using 2 jobs.
result = permutation_importance(
    forest, X, y, n_repeats=10, random_state=seed, n_jobs=2
) 

In [None]:
# Mean / std of the permutation importances per feature, most important first.
forest_importances = pd.DataFrame(
    {'means': result.importances_mean, 'stds': result.importances_std},
    index=feature_names,
).sort_values('means', ascending=False)

# Bar chart of the top features with their std as error bars.
top_importances = forest_importances.iloc[:max_features_to_show]
top_importances['means'].plot.bar(
    title=f"Feature Importances Using Permutation on Full {label} Model",
    yerr=top_importances['stds'],
)

In [None]:
# Decompose every forest prediction into a bias term plus per-feature
# contributions.
prediction, bias, contributions = ti.predict(forest, X.values)

# `contributions` is indexed [sample, feature, class]. Take class 1 and
# transpose so that stack[i] holds the contributions of feature i across all
# samples (same layout the per-feature loop used to build).
stack = np.ascontiguousarray(contributions[:, :, 1].T)

# Mean and spread of each feature's contribution over all samples.
contri_means = stack.mean(axis=1)
contri_stds = stack.std(axis=1)

contrib_df = pd.DataFrame({'means': contri_means, 'stds': contri_stds}, index=feature_names)

# Restrict the chart to the features ranked highest by permutation importance.
contrib_graph_data = contrib_df.loc[forest_importances.index[:max_features_to_show]]

contrib_graph_data['means'].plot.bar(title=f"Understanding Effects of Important Features on {label} Prediction",
                                     yerr = contrib_graph_data['stds'],
                                     grid=True)


In [None]:
# One 2-D histogram (feature value vs. contribution) per top-ranked feature,
# saved as a PNG. Iteration follows the column order of feature_names.
top_feats = set(forest_importances.index[:max_features_to_show])

for i, j in enumerate(feature_names):
    if j not in top_feats:
        continue
    fig, ax = plt.subplots()
    # cmin=1 leaves bins with no samples blank.
    hist_out = ax.hist2d(X[j], stack[i], (50, 50), cmap=plt.cm.jet, cmin=1)
    fig.colorbar(hist_out[3], ax=ax)
    ax.set_title(f'Tree Interpreter Contributions for {j}')
    fig.savefig(f'../../outputs/ccu/treeinterpreter-contribs-{j}.png')
    plt.close('all')

In [None]:
# Histogram of the class-1 values returned by treeinterpreter's predict.
fig, ax = plt.subplots()
ax.hist(prediction[:, 1])
ax.set_title(f'Distribution of {label} Predictions')
plt.show()
plt.close('all')

# K-means Clustering

In [None]:
def silhoutte_kmeans(df_km, k_values=None):
    """Fit K-means for several cluster counts and keep the best silhouette fit.

    Parameters
    ----------
    df_km : array-like
        Data to cluster; passed unchanged to ``KMeans`` and
        ``silhouette_score`` (a DataFrame in this notebook).
    k_values : iterable of int, optional
        Candidate cluster counts. Defaults to ``range(min_clus, max_clus)``
        from the config cell, i.e. 5, 6, 7 with the current settings -- the
        same candidates that used to be hard-coded here.

    Returns
    -------
    centers
        ``cluster_centers_`` of the model with the highest average
        silhouette score.
    n_clus : int
        Number of clusters of that model.
    labels
        Cluster assignment of every row of ``df_km`` under that model.

    Raises
    ------
    ValueError
        If ``k_values`` yields no candidates.
    """
    if k_values is None:
        k_values = range(min_clus, max_clus)

    best_score = None
    best_centers = None
    best_labels = None

    for n_clus in k_values:
        km = KMeans(n_clusters=n_clus, random_state=seed, algorithm="lloyd").fit(df_km)
        # Predict once and reuse the assignment for scoring and for the result.
        assignments = km.predict(df_km)
        score = silhouette_score(df_km, assignments)
        # Strict ">" keeps the first candidate on ties, as list.index(max()) did.
        if best_score is None or score > best_score:
            best_score = score
            best_centers = km.cluster_centers_
            best_labels = assignments

    if best_score is None:
        raise ValueError("k_values must contain at least one cluster count")

    return best_centers, len(best_centers), best_labels

In [None]:
# Wrap the scaled array in a DataFrame with the original column names and keep
# only the clustering columns. NOTE: this rebinds X_scaled, so the cell cannot
# be re-run without first re-running the scaling cell.
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)[clus_cols]

In [None]:
# Cluster the scaled clustering columns; keeps the centres, cluster count and
# per-row labels of the best-scoring K-means fit.
centers_X, n_clus_X, labels_X = silhoutte_kmeans(X_scaled)

In [None]:
# Number of rows assigned to each cluster, drawn as a horizontal bar chart.
cluster_sizes = pd.Series(labels_X).value_counts()
cluster_sizes.plot.barh(
    title=f'{indiv} Cluster Distribution (K-means)',
    xlabel='Cluster',
    ylabel=f'n({indiv})',
    rot=0,
)

In [None]:
# One row per cluster centre, one column per clustering feature (scaled units).
an_X = pd.DataFrame(centers_X, columns=X_scaled.columns)

In [None]:
# Plot bar graphs of centroids per cluster: for each cluster, the largest
# centroid coordinates in descending order, saved as one PNG per cluster.
for i in range(n_clus_X):
    top_coords = an_X.iloc[i].sort_values(ascending=False).head(max_features_to_show)
    ax = top_coords.plot.barh(title=f"Cluster {i} Centroids")
    plt.tight_layout()
    ax.get_figure().savefig(f'../../outputs/ccu/clus-{i}-centroids.png')
    plt.close('all')

In [None]:
#determine extent of delineation of clusters
pca = PCA(n_components=2)
pca.fit(X_scaled)
X_transformed = pca.transform(X_scaled)

In [None]:
# Fraction of the total variance captured by each of the two components.
print(pca.explained_variance_ratio_)

In [None]:
# Singular values associated with the two fitted components.
print(pca.singular_values_)

In [None]:
# Scatter of the two principal components, coloured by K-means cluster label.
pc1 = X_transformed[:, 0]
pc2 = X_transformed[:, 1]

fig, ax = plt.subplots()
ax.scatter(pc1, pc2, c=labels_X)
plt.show()
plt.close('all')

# Sensitivity (sn) and Specificity (sp)

In [None]:
# Keep only column 1 of the predict_proba output as the score.
# Guarded on dimensionality so re-running this cell is harmless: previously a
# second run raised IndexError because y_score was already 1-D.
if np.ndim(y_score) == 2:
    y_score = y_score[:, 1]
# Align the labels with the rows that were scored.
y_test = y_test.loc[indices]

In [None]:
# Decision threshold applied to the positive-class score.
thresh = 0.5

# Build each boolean mask once and count with the vectorised .sum() instead of
# iterating the Series element by element with the builtin sum().
pred_pos = y_score >= thresh
pred_neg = y_score < thresh
actual_pos = y_test == 1
actual_neg = y_test == 0

true_pos = (pred_pos & actual_pos).sum()
true_neg = (pred_neg & actual_neg).sum()
false_pos = (pred_pos & actual_neg).sum()
false_neg = (pred_neg & actual_pos).sum()

print(true_pos)
print(true_neg)
print(false_pos)
print(false_neg)

# Accuracy, sensitivity (true positive rate) and specificity (true negative rate).
acc = (true_pos + true_neg) / (true_pos + true_neg + false_pos + false_neg)
sn = true_pos / (true_pos + false_neg)
sp = true_neg / (true_neg + false_pos)

print(acc)
print(sn)
print(sp)

In [None]:
# Per-row export table: class-1 contribution of every feature, plus the
# class-1 bias term and the K-means cluster label.
# NOTE: this rebinds `df`, replacing the training frame loaded at the top.
df = pd.DataFrame(contributions[:, :, 1], index=X.index, columns=feature_names)
df['Bias'] = bias[:, 1]
df['Cluster'] = labels_X

In [None]:
# Write the contributions / bias / cluster table to disk (row index included).
df.to_csv('../../outputs/ccu/all-employees-ccu.csv')