In [None]:
# Data source: https://www.kaggle.com/ludobenistant/hr-analytics

In [None]:
import itertools as it
import operator as op
from matplotlib import cm as colmap
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Read the HR dataset from its relative CSV path into a DataFrame.
hr_csv_path = "../data/HR.csv"
df = pd.read_csv(hr_csv_path)

In [None]:
# Tidy the column names: fix the "montly" typo, lower-case "Work_accident",
# and call the "sales" column "department".
column_renames = {
    "average_montly_hours": "avg_monthly_hours",
    "Work_accident": "work_accident",
    "sales": "department",
}
df = df.rename(columns=column_renames)
df

In [None]:
# Correlation heatmap over the numeric (and boolean) columns only.
# The frame also carries the string columns "salary" and "department"; since
# pandas 2.0 DataFrame.corr() no longer drops non-numeric columns silently and
# raises on them, so select the numeric columns explicitly first.
numeric_corr = df.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(numeric_corr)

In [None]:
# Predictor columns fed to the scikit-learn model; "left" is the target.
feature_columns = [
    "satisfaction_level",
    "last_evaluation",
    "number_project",
    "avg_monthly_hours",
    "time_spend_company",
    "work_accident",
    "promotion_last_5years",
]
features = df[feature_columns].values
target = df["left"].values

# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, train_size=0.8, random_state=1
)

In [None]:
# Degree-3 polynomial feature expansion followed by an ordinary linear
# regression on the 0/1 "left" target.
model_poly = make_pipeline(PolynomialFeatures(3), LinearRegression())
model_poly.fit(X_train, y_train)

# Keep the continuous regression output under its own name, then threshold the
# whole array at once: scores <= 0.5 become 0, everything else becomes 1.
# np.where replaces the former np.vectorize(lambda ...) call, which looped over
# the elements in Python; the resulting labels are the same.
raw_scores = model_poly.predict(X_test)
predictions = np.where(raw_scores <= 0.5, 0, 1)

In [None]:
# Number of correctly classified test rows, followed by the test-set size.
n_correct = (predictions == y_test).sum()
print(n_correct, y_test.shape[0])

In [None]:
# ----

In [None]:
import statsmodels.formula.api as smf

In [None]:
# Logistic regression of "left" on the listed predictors; salary and
# department enter as categoricals, the two flags as boolean comparisons.
formula = (
    "left ~ satisfaction_level + last_evaluation + number_project"
    " + avg_monthly_hours + time_spend_company"
    " + work_accident==1 + promotion_last_5years==0"
    " + C(salary) + C(department)"
)
logit_spec = smf.logit(formula, data=df)
model = logit_spec.fit()
model.summary()

In [None]:
# Fitted coefficients, smallest (most negative) first.
model.params.sort_values(ascending=True)

In [None]:
# We only want statistically significant features (p-value below 0.1).
# .copy() makes it explicit that the scaling below never touches model.params.
significant = model.params[model.pvalues < 0.1].copy()

# A coefficient's magnitude is inversely related to the scale of its variable,
# so multiply each of these numeric coefficients by the column maximum to make
# the bars comparable. Item assignment is used (rather than attribute
# assignment on the Series) so a coefficient that is missing from
# `significant` fails loudly with a KeyError naming it.
# NOTE(review): last_evaluation is left unscaled, exactly as before —
# presumably because its maximum is 1.0; confirm against the data.
scaled_columns = (
    "satisfaction_level",
    "number_project",
    "avg_monthly_hours",
    "time_spend_company",
)
for column in scaled_columns:
    significant[column] *= df[column].max()
significant = significant.sort_values()

values = significant.values
# Human-readable labels, listed in the order of the sorted coefficients.
keys = ["[N] Satisfaction Level", "[N] Base Rate (Includes High Salary)", "[N] Number of Projects", "[B] Had a Work Accident", "[B] Works in Misc./Unknown", "[B] Works in Management", "[B] Works in Support", "[B] Works in Tech", "[B] Works in HR", "[N] Last Evaluation", "[N] Avg. Monthly Hours", "[B] Medium Salary", "[B] Not Promoted in Last 5 Years", "[B] Low Salary", "[N] Years at Company"]
# TODO: ask Allen how to keep high salary separate from base rate

# The labels are matched to the bars purely by position, so refuse to draw a
# chart whose label count no longer matches the number of coefficients.
# NOTE(review): a changed sort order with the same count would still go
# unnoticed — re-check the labels against `significant.index` after refitting.
if len(keys) != len(values):
    raise ValueError(
        "expected {0} bar labels for the significant coefficients, got {1}: {2}".format(
            len(values), len(keys), list(significant.index)
        )
    )

fig, ax = plt.subplots(figsize=(16, 8))
positions = [p + 0.5 for p in range(len(values))]
# `values` is sorted ascending, so the negative (blue) bars come first and the
# non-negative (red) bars follow; stacking the colours in that order lines up.
b_color = colmap.Blues(np.abs(values[values < 0]))
r_color = colmap.Reds(values[values >= 0])
# Centre the bars on `positions` explicitly and put the ticks at the same
# places. The former `position + 0.4` tick offset only lines up with
# edge-aligned bars and shifts the labels when bars are centre-aligned.
ax.bar(positions, values, color=np.vstack((b_color, r_color)), align="center")
ax.set_xticks(positions)
ax.set_xticklabels(keys, rotation=80)
ax.tick_params(labelsize=16)
ax.set_xlabel("Feature ([N] for number, [B] for boolean)", fontsize=20)
ax.set_ylabel("Quitting Likelihood Coefficient", fontsize=20)
ax.set_title("Figure 1: LogReg Model Coefficients", fontsize=24)
plt.show()

In [None]:
class CompanyModel(object):
    """Logit attrition model that records one prediction per scenario.

    Attributes:
        results: DataFrame holding one row per ``predict`` call: the scenario's
            columns plus an ``attrition`` column with the predicted value.
        model: fitted statsmodels logit results object; only set by ``train``.
    """

    def __init__(self):
        self.results = pd.DataFrame()

    def train(self, data, formula):
        """Fit the logit model described by ``formula`` on ``data``.

        Args:
            data: DataFrame containing every column the formula references.
            formula: patsy-style formula string passed to ``smf.logit``.
        """
        self.model = smf.logit(formula, data = data).fit()

    def predict(self, keys, vals):
        """Predict one scenario and add it as a new row of ``self.results``.

        ``model.predict`` returns the probability [0.0 - 1.0] that an employee
        leaves under the given conditions; we extend this probability to be the
        ratio of employees that leave a company with such conditions.

        Args:
            keys: column names of the scenario, in the same order as ``vals``.
            vals: one value per entry of ``keys``.

        Raises:
            AttributeError: if ``train`` has not been called yet.
        """
        sample = pd.DataFrame([vals], columns = list(keys))
        # The one-row frame has the default index, so label 0 is its only row.
        sample["attrition"] = self.model.predict(sample)[0]
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # replacement. The first row is stored directly so that no empty frame
        # takes part in the concatenation.
        # Each call copies the accumulated frame, so for large batches a single
        # vectorized self.model.predict(frame) call is far cheaper.
        if self.results.empty:
            self.results = sample
        else:
            self.results = pd.concat([self.results, sample], ignore_index = True)

In [None]:
# Fit the scenario model on the full data set. This formula encodes the
# accident flag as work_accident==0 (the earlier logit cell uses ==1).
attrition_formula = (
    "left ~ satisfaction_level + last_evaluation + number_project"
    " + avg_monthly_hours + time_spend_company"
    " + work_accident==0 + promotion_last_5years==0"
    " + C(salary) + C(department)"
)
cm = CompanyModel()
cm.train(df, attrition_formula)

In [None]:
# Candidate values for every model input; the Cartesian product of these
# lists is the set of company conditions to score.
params = {
    # np.linspace gives exactly six evenly spaced points including 1.0;
    # np.arange with a float step can gain or lose the end point to rounding.
    "satisfaction_level":    np.linspace(0.0, 1.0, 6),
    "last_evaluation":       np.linspace(0.0, 1.0, 6),
    "number_project":        range(2, 10, 2),
    "avg_monthly_hours":     range(100, 350, 50),
    "time_spend_company":    range(2, 14, 4),
    "work_accident":         [0, 1],
    "promotion_last_5years": [0, 1],
    "salary":                ["low", "medium", "high"],
    "department":            ["sales", "technical", "support", "IT", "product_mng", "marketing", "RandD", "accounting", "hr", "management"]
}
# 6 * 6 * 4 * 5 * 3 * 2 * 2 * 3 * 10 = 259200 cases
keys, vals = zip(*params.items())

# Score the whole grid with a single vectorized call. Predicting one row at a
# time rebuilt the results frame once per case, which is quadratic in the
# number of cases and was the cause of the long runtime.
grid = pd.DataFrame(list(it.product(*vals)), columns=list(keys))
grid["attrition"] = np.asarray(cm.model.predict(grid))
# Replace (rather than extend) the stored results so that re-running this cell
# does not duplicate every scenario.
cm.results = grid

In [None]:
# For each attrition threshold, report how many scenario combinations stay at
# or below it. The comparison is inclusive so that it matches the "<=" printed
# in the message (it used to be a strict "<").
total = len(cm.results)
for quit_rate in [0.01, 0.05, 0.10]:
    # int() keeps plain-Python arithmetic in the percentage below.
    n_below = int((cm.results.attrition <= quit_rate).sum())
    print("attrition <= {0}%: {1} / {2} condition combinations ({3:.2f}%)".format(100 * quit_rate, n_below, total, 100.0 * n_below / total))

In [None]:
# Scenarios ordered from the highest predicted attrition to the lowest.
cm.results.sort_values(by="attrition", ascending=False)

In [None]:
# TODO: make example plots of how the attrition rate changes when all
# but 1 or 2 variables are held constant (at different levels).