In [None]:
# Data source: https://www.kaggle.com/ludobenistant/hr-analytics

In [None]:
import itertools as it
import operator as op
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Load the raw HR table from the CSV file that sits next to the notebook folder.
hr_csv_path = "../data/HR.csv"
df = pd.read_csv(hr_csv_path)

In [None]:
# Tidy the column names: fix the misspelled "montly", lower-case
# "Work_accident", and call the "sales" column "department".
# Rebinding the result (instead of inplace=True) keeps the cell idempotent and
# avoids mutating the frame behind the back of earlier cells.
df = df.rename(columns={
    "average_montly_hours": "avg_monthly_hours",
    "Work_accident": "work_accident",
    "sales": "department",
})
# Preview only the first rows rather than rendering the whole table.
df.head()

In [None]:
# Correlation heatmap over the numeric columns only. The frame also carries
# string columns (salary, department); pandas >= 2.0 no longer drops those
# silently in corr() and raises instead, so select the numeric ones explicitly.
sns.heatmap(df.select_dtypes(include="number").corr())

In [None]:
# Numeric predictors used for the scikit-learn model; "left" is the target.
feature_columns = [
    "satisfaction_level",
    "last_evaluation",
    "number_project",
    "avg_monthly_hours",
    "time_spend_company",
    "work_accident",
    "promotion_last_5years",
]
feature_matrix = df[feature_columns].values
target_vector = df["left"].values

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix,
    target_vector,
    train_size=0.8,
    random_state=1,
)

In [None]:
# Degree-3 polynomial feature expansion followed by an ordinary linear
# regression, fitted on the training split.
model_poly = make_pipeline(PolynomialFeatures(3), LinearRegression())
model_poly.fit(X_train, y_train)

# Keep the continuous regression output under its own name so `predictions`
# only ever holds class labels.
scores = model_poly.predict(X_test)

# Threshold at 0.5: scores <= 0.5 become class 0, everything else class 1.
# np.where does this in one vectorised step (np.vectorize with a lambda is a
# Python-level loop and fails on empty input).
predictions = np.where(scores <= 0.5, 0, 1)

In [None]:
# Number of test rows classified correctly, followed by the test-set size.
n_correct = np.count_nonzero(predictions == y_test)
n_total = len(y_test)
print(n_correct, n_total)

In [None]:
# ----

In [None]:
import statsmodels.formula.api as smf

In [None]:
# Logistic regression of "left" on every predictor, with salary and department
# treated as categoricals.
# NOTE(review): this formula encodes work_accident==1 while the CompanyModel
# training cell further down uses work_accident==0 -- confirm which is intended.
formula = "left ~ satisfaction_level + last_evaluation + number_project + avg_monthly_hours + time_spend_company + work_accident==1 + promotion_last_5years==0 + C(salary) + C(department)"
logit_spec = smf.logit(formula, data=df)
model = logit_spec.fit()
model.summary()

In [None]:
# Fitted coefficients, ordered from most negative to most positive.
# TODO: plot the coefficients?
# TODO: colormap with blue = negative (satisfaction), red = positive (low salary),
#       white = no effect (hours/month)?
model.params.sort_values(ascending=True)

In [None]:
class CompanyModel(object):
    """Logit attrition model plus a running table of scored scenarios.

    Attributes:
        model: fitted statsmodels logit result, set by `train` (None before).
        results: DataFrame with one row per `predict` call; holds the scenario
            columns and an "attrition" column with the predicted probability.
    """

    def __init__(self):
        self.model = None
        self.results = pd.DataFrame()

    def train(self, data, formula):
        """Fit a logit model described by `formula` on `data` and store it."""
        self.model = smf.logit(formula, data=data).fit()

    def predict(self, keys, vals):
        """Score one scenario and add it to `self.results`.

        Args:
            keys: column names of the scenario, in the same order as `vals`.
            vals: one value per key.

        The model returns the probability [0.0 - 1.0] that an employee leaves
        under the given conditions; this is read as the share of employees
        that would leave a company offering those conditions.
        """
        sample = pd.DataFrame([vals], columns=list(keys))
        # Take the single prediction positionally so the lookup does not
        # depend on the index labels of the returned object.
        sample["attrition"] = np.ravel(self.model.predict(sample))[0]
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
        # The first sample simply becomes the table, which avoids
        # concatenating with an empty frame.
        if self.results.empty:
            self.results = sample
        else:
            self.results = pd.concat([self.results, sample], ignore_index=True)

In [None]:
# Fit the scenario model on the full HR table.
attrition_formula = "left ~ satisfaction_level + last_evaluation + number_project + avg_monthly_hours + time_spend_company + work_accident==0 + promotion_last_5years==0 + C(salary) + C(department)"
cm = CompanyModel()
cm.train(data=df, formula=attrition_formula)

In [None]:
# Hypothetical working conditions to score: every combination of these values.
params = {
    "satisfaction_level":    np.arange(0.1, 1.1, 0.1),
    "last_evaluation":       np.arange(0.1, 1.1, 0.1),
    "number_project":        range(2, 8, 1),
    "avg_monthly_hours":     range(100, 320, 20),
    "time_spend_company":    range(2, 11, 1),
    "work_accident":         [0, 1],
    "promotion_last_5years": [0, 1],
    "salary":                ["low", "medium", "high"],
    "department":            ["sales", "technical", "support", "IT", "product_mng", "marketing", "RandD", "accounting", "hr", "management"]
}
# 10*10*6*11*9*2*2*3*10 = 7128000
keys, vals = zip(*params.items())

# Build the whole Cartesian product as one frame (same row order as
# itertools.product) and score it with a single vectorised predict call.
# Scoring row by row would mean millions of one-row model evaluations plus a
# results table that is re-copied on every append.
grid = pd.MultiIndex.from_product(vals, names=keys).to_frame(index=False)
grid["attrition"] = cm.model.predict(grid)

# Assign rather than append so re-running the cell does not duplicate rows.
cm.results = grid

In [None]:
# Scenarios whose predicted attrition is below 5 %.
low_attrition_mask = cm.results["attrition"] < 0.05
cm.results.loc[low_attrition_mask]

In [None]:
# for quit_rate in [0.01, 0.05, 0.1]:
#     print("with at most {0}% of employees quitting, employee conditions can be as poor as {1}!".format(quit_rate * 100, cm.results[cm.results.attrition < quit_rate]))

In [None]:
# "Designing the Optimal Employee Experience"
# see how bad conditions can be for employees before they quit?
# make regression model, figure out how low pay can be (etc.) for each predicted % of employees leaving