# Employee Attrition Prediction

#### Add import statement

In [None]:
import pandas as pd
import seaborn as sns
%matplotlib inline
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold

# Import and suppress warnings
import warnings
warnings.filterwarnings('ignore')

#### load employee data

In [None]:
# Load the tab-separated employee dataset from the notebook's working directory.
# A forward slash is used on purpose: "\d" is an invalid escape sequence inside a
# normal string literal, and a backslash separator only resolves on Windows.
emp_data = pd.read_csv("./data.csv", sep='\t')
emp_data.head()

#### check the data type 

In [None]:
# Display the dtype pandas inferred for every column of the dataset.
emp_data.dtypes

#### check for null data

In [None]:
# Report, per column, whether at least one value is missing.
emp_data.isnull().any()
# Author's reading of the output: no nulls are present, so the data is fairly clean
# and there is no need to impute synthetic values for missing entries.

#### Process Data

In [None]:
# Summary statistics for the numeric columns, used to eyeball outliers and other
# issues that could add noise.
emp_data.describe()

In [None]:
## EmployeeCount, StandardHours, Over18 and EmployeeNumber are considered
## irrelevant for the prediction, so they are removed from the frame.
emp_data = emp_data.drop(
    labels=["EmployeeCount", "StandardHours", "Over18", "EmployeeNumber"],
    axis=1,
)

In [None]:
## Replace each two-valued text column with its integer category code.
for _binary_col in ("Gender", "OverTime", "Attrition"):
    emp_data[_binary_col] = emp_data[_binary_col].astype('category').cat.codes


#### Plotting Data

In [None]:
# Numeric columns to compare pairwise, plus the Attrition target used for colouring.
numerical = [
    'Age', 'DailyRate', 'JobSatisfaction', 'MonthlyIncome',
    'PerformanceRating', 'WorkLifeBalance', 'YearsAtCompany', 'Attrition',
]

# Scatter plot for every pair of columns, with a shaded KDE on the diagonal.
g = sns.pairplot(
    emp_data[numerical],
    hue='Attrition',
    palette='seismic',
    diag_kind='kde',
    diag_kws={'shade': True},
)
# Blank out the x tick labels on all subplots.
g.set(xticklabels=[])

In [None]:
## One-hot encode the multi-valued categorical columns, then group the resulting
## column names by theme.
emp_data = pd.get_dummies(emp_data, columns=["MaritalStatus", "EducationField"])
emp_data = pd.get_dummies(emp_data, columns=["Department", "JobRole"])

personal_features = [
    'Age',
    'DistanceFromHome',
    'Education',
    'EducationField_Human Resources',
    'EducationField_Life Sciences',
    'EducationField_Marketing',
    'EducationField_Medical',
    'EducationField_Other',
    'EducationField_Technical Degree',
    'Gender',
    'MaritalStatus_Divorced',
    'MaritalStatus_Married',
    'MaritalStatus_Single',
    'NumCompaniesWorked',
]
money_features = [
    'DailyRate',
    'HourlyRate',
    'MonthlyIncome',
    'MonthlyRate',
    'PercentSalaryHike',
    'StockOptionLevel',
]
satisfication_features = [
    'EnvironmentSatisfaction',
    'JobSatisfaction',
    'RelationshipSatisfaction',
    'WorkLifeBalance',
]

emp_data = pd.get_dummies(emp_data, columns=["BusinessTravel"])
perks_info_features = [
    'BusinessTravel_Travel_Rarely',
    'BusinessTravel_Travel_Frequently',
    'BusinessTravel_Non-Travel',
    'JobInvolvement',
    'OverTime',
    'PerformanceRating',
]

job_role_features = [
    'JobRole_Healthcare Representative',
    'JobRole_Human Resources',
    'JobRole_Laboratory Technician',
    'JobRole_Manager',
    'JobRole_Manufacturing Director',
    'JobRole_Research Director',
    'JobRole_Research Scientist',
    'JobRole_Sales Executive',
    'JobRole_Sales Representative',
]
department_features = [
    'Department_Human Resources',
    'Department_Research & Development',
    'Department_Sales',
]
employee_work_features = [
    'JobLevel',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]
# Job-related groups combined: role, department, tenure and perks, in that order.
emp_imp_features = [
    *job_role_features,
    *department_features,
    *employee_work_features,
    *perks_info_features,
]


In [None]:
## Log-transform the monetary columns with log1p to compress their wide ranges.
## NOTE: this is a log transform, not min-max or standard feature scaling.
emp_data["MonthlyRateLog"] = np.log1p(emp_data["MonthlyRate"])
emp_data["MonthlyIncomeLog"] = np.log1p(emp_data["MonthlyIncome"])
emp_data["HourlyRateLog"] = np.log1p(emp_data["HourlyRate"])
emp_data["DailyRateLog"] = np.log1p(emp_data["DailyRate"])

money_log_features = ['DailyRateLog','HourlyRateLog','MonthlyIncomeLog','MonthlyRateLog','PercentSalaryHike','StockOptionLevel']

# emp_imp_features already contains perks_info_features, so it must not be added a
# second time: doing so selects those columns twice and hands every model
# duplicated inputs (which, for example, double-weights them in KNN distances).
all_featues = personal_features + money_log_features + satisfication_features + emp_imp_features
#TODO: find best features using
#rfc.feature_importances_
best_features = ['MonthlyIncome','Age','DailyRate','OverTime','MonthlyRate','TotalWorkingYears','HourlyRate','YearsAtCompany','YearsWithCurrManager','PercentSalaryHike','NumCompaniesWorked','DistanceFromHome','JobLevel','EnvironmentSatisfaction','RelationshipSatisfaction','JobInvolvement','YearsInCurrentRole','WorkLifeBalance','StockOptionLevel']




#### Define model train and model performance function

In [None]:
def train_model(model,features,dataset):
    """Fit *model* on a random 80/20 split of *dataset* and report test accuracy.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``score``
    features : list of column names used as model inputs
    dataset : DataFrame holding those columns plus the 'Attrition' target

    Returns
    -------
    (true test labels, predicted test labels)

    The split is unseeded, so every call draws a different partition.
    """
    inputs = dataset[features]
    target = dataset['Attrition']

    # Hold out 20% of the rows for evaluation.
    partition = train_test_split(inputs, target, test_size=0.2)
    train_inputs, test_inputs, train_target, test_target = partition

    model.fit(train_inputs, train_target)
    predicted = model.predict(test_inputs)
    print("Model accuracy score is ", model.score(test_inputs, test_target))
    return test_target, predicted


def measure_perform_matrix(Y_pr,Y_te):
    """Print confusion-matrix metrics and plot a ROC curve for binary predictions.

    Label 1 is treated as the positive class, matching scikit-learn's
    convention for ``confusion_matrix`` and ``roc_curve``.
    Assumes the labels are encoded as 0/1 -- TODO confirm if the target
    encoding ever changes.

    Parameters
    ----------
    Y_pr : array-like of predicted class labels
    Y_te : array-like of true class labels

    Returns
    -------
    None. Counts, accuracy, sensitivity, specificity, precision and F1 are
    printed, and the ROC plot is shown.
    """
    # scikit-learn expects (y_true, y_pred). With labels=[0, 1] the matrix is always
    # 2x2 (even if one class is absent) and flattens to tn, fp, fn, tp.
    TN, FP, FN, TP = confusion_matrix(Y_te, Y_pr, labels=[0, 1]).ravel()

    print("True Positive :",TP)
    print("False Positive :",FP)
    print("False Negative :",FN)
    print("True Negative :",TN)

    def _ratio(numerator, denominator):
        # A metric with an empty denominator is reported as 0.0 instead of nan/inf.
        return numerator / denominator if denominator else 0.0

    Acc = _ratio(TP + TN, TP + FP + FN + TN)
    print(Acc)

    Sensitivity = _ratio(TP, TP + FN)
    Specificity = _ratio(TN, TN + FP)

    print("Sensitivity :",Sensitivity)
    print("Specificity :",Specificity)

    Precision = _ratio(TP, TP + FP)
    # Harmonic mean of precision and recall, written so a zero precision or
    # recall cannot trigger a division by zero.
    F1_Score = _ratio(2 * TP, 2 * TP + FP + FN)
    print("Precision : ",Precision)
    print("F1 Score",F1_Score)

    ## Computing false and true positive rates.
    # roc_curve takes the true labels first. Because hard class predictions are
    # passed as the score, the curve has a single operating point between (0, 0)
    # and (1, 1).
    fpr, tpr, _ = roc_curve(Y_te, Y_pr, drop_intermediate=False)
    plt.figure()
    # plot the ROC curve
    plt.plot(fpr, tpr, color='red', lw=2, label='ROC curve')
    # diagonal = performance of a random classifier
    plt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')
    # Title and labels
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve')
    plt.show()

#Train model 
def train_model_grad(model,features,dataset,val):
    """Fit *model* on a seeded 80/20 split of *dataset* and report test accuracy.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``score``
    features : list of column names used as model inputs
    dataset : DataFrame holding those columns plus the 'Attrition' target
    val : value forwarded as ``random_state`` to ``train_test_split``

    Returns
    -------
    (true test labels, predicted test labels)
    """
    # Seeded hold-out split: 20% of the rows are kept for evaluation.
    fit_X, eval_X, fit_y, eval_y = train_test_split(
        dataset[features],
        dataset['Attrition'],
        test_size=0.2,
        random_state=val,
    )

    model.fit(fit_X, fit_y)
    eval_pred = model.predict(eval_X)

    accuracy = model.score(eval_X, eval_y)
    print("Model accuracy score is ",accuracy)
    return eval_y, eval_pred

#### Logistic Regression

In [None]:
lReg = LogisticRegression()
# train_model_grad returns (true test labels, predicted labels).
Y_test, Y_predict = train_model_grad(lReg,all_featues,emp_data,26)
# measure_perform_matrix is declared as (Y_pr, Y_te), i.e. predictions first, the
# same order the other model cells use.
measure_perform_matrix(Y_predict, Y_test)

#### Random Forest Model 

In [None]:
# random_state pins the bootstrap/feature sampling so the reported scores can be
# reproduced; 26 matches the seed used for the train/test split.
forest=RandomForestClassifier(n_estimators=1000,criterion="gini",max_depth=10,n_jobs=-1,random_state=26)
# train_model_grad returns (true test labels, predicted labels).
Y_test, Y_predict = train_model_grad(forest,all_featues,emp_data,26)
# measure_perform_matrix is declared as (Y_pr, Y_te), i.e. predictions first, the
# same order the other model cells use.
measure_perform_matrix(Y_predict, Y_test)

#### Neural Network

In [None]:
# TODO: feed scaled features into the network
# Single hidden layer of 256 ReLU units trained with Adam, seeded for repeatability.
neuNet = MLPClassifier(
    hidden_layer_sizes=(256,),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=26,
)
y_te, y_pr = train_model_grad(neuNet, all_featues, emp_data, 26)
measure_perform_matrix(y_pr, y_te)

#### Gradient boosting classifier

In [None]:
# random_state makes the boosted ensemble reproducible; 26 matches the seed used
# for the train/test split and the neural network.
gradBoost = GradientBoostingClassifier(n_estimators=45, random_state=26)
y_te,y_pr = train_model_grad(gradBoost,all_featues,emp_data,26)
measure_perform_matrix(y_pr,y_te)


#### KNeighbors Classifier

In [None]:
# k-nearest-neighbours classifier that votes over the 10 closest training rows.
knn = KNeighborsClassifier(n_neighbors=10)
y_te, y_pr = train_model_grad(
    model=knn,
    features=all_featues,
    dataset=emp_data,
    val=26,
)
measure_perform_matrix(Y_pr=y_pr, Y_te=y_te)

### Hyperparameter Tuning

In [None]:
# sklearn.grid_search was removed in scikit-learn 0.20; GridSearchCV lives in
# sklearn.model_selection, the module this notebook already imports from.
from sklearn.model_selection import GridSearchCV
# candidate values for the random-forest grid
params_range=[100,250,500,1000]
params_depth=[2,3,5]
param_grid={'n_estimators':params_range,
           'criterion':['entropy','gini'],
           'max_depth':params_depth}
# Exhaustive 5-fold cross-validated search over the grid, scored by accuracy.
gridsearch=GridSearchCV(estimator=forest,
                       param_grid=param_grid,
                       scoring='accuracy',
                       cv=5,
                       n_jobs=1)

X_train = emp_data[all_featues]
Y_train = emp_data['Attrition']

# Split the data for training and testing. The split is seeded with 26, like the
# model cells, so the tuning run is reproducible and uses the same hold-out rows.
X_tr, X_te, y_tr, y_te = train_test_split(X_train, Y_train, test_size=0.2, random_state=26)

gridsearch.fit(X_tr,y_tr)
print(gridsearch.best_score_)
print(gridsearch.best_params_)

# Running on the test set. With the default refit=True, GridSearchCV has already
# refit best_estimator_ on all of X_tr, so it is scored directly; fitting it again
# would only replace the selected model with a freshly randomised one.
clf=gridsearch.best_estimator_
print('Test accuracy : %.3f' %clf.score(X_te,y_te))

# Kept only so the name stays defined for any later cell: this is the same
# random-forest estimator as clf, not a support vector machine.
svm=clf


In [None]:
# Sweep K from 1 to 50 and record the mean 10-fold cross-validated accuracy of a
# KNN classifier for each value, to find the best neighbourhood size.
X_train = emp_data[all_featues]
Y_train = emp_data['Attrition']

k_range = range(1, 51)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(
        knn,
        X_train,
        Y_train,
        scoring='accuracy',
        cv=10,
    )
    k_scores.append(scores.mean())

print(k_scores)



In [None]:
# Plot the mean cross-validated accuracy (k_scores) against each candidate K (k_range).
plt.plot(k_range,k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross Validated Accuracy')

# Summary

## Steps followed to create above models:
* **Data Wrangling** : The data was fairly clean, with no notable outliers or missing values.
* **Feature engineering**: Selecting features which help us maximize accuracy. As part of this activity we have:    
> * Dropped some columns that were not relevant and only added noise, e.g. EmployeeCount, StandardHours, etc.
> * We also log-transformed some of the features that had a wide range (this improved model accuracy and performance)
* **Choosing a model** : We researched about different types of models to select the best one suitable for our problem.
* **Training** : We trained the models which were selected as part of the above exercise. We found *Neural Network* performed the best.
* **Evaluation** : Created custom function to evaluate different performance parameters of a given model for better evaluation
* **Hyperparameter tuning** : On the selected model, we tuned the hyperparameters to get best performance metrics such as ROC curve and accuracy out of the given models
* **Prediction**: The model with the best metrics was selected.


## Model Persistence

In [None]:
import pickle
def load_model(filename,X_test,Y_test):
    """Load a pickled model from *filename* and print its score on the test data.

    Parameters
    ----------
    filename : path of the pickle file to read
    X_test : inputs passed to the loaded model's ``score`` method
    Y_test : expected labels passed to the loaded model's ``score`` method

    Returns
    -------
    None. The score is printed.
    """
    # SECURITY: unpickling can execute arbitrary code stored in the file, so only
    # load model files that you created yourself or otherwise trust.
    # The context manager closes the file handle even if unpickling fails.
    with open(filename, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    result = loaded_model.score(X_test, Y_test)
    print('Model accuracy is ',result)

In [None]:
#Save model to disk
def save_model(model, filename='SavedModel.sav'):
    """Pickle *model* to *filename*.

    Parameters
    ----------
    model : any picklable object (here, a fitted estimator)
    filename : destination path; defaults to 'SavedModel.sav'

    Returns
    -------
    None.
    """
    # The context manager flushes and closes the file even if pickling fails,
    # so a complete file is on disk as soon as the call returns.
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

In [None]:
# Persist the neural-network classifier (neuNet) to disk via save_model.
save_model(neuNet)

In [None]:
# Rebuild the seed-26 80/20 split so the reloaded model is scored on held-out rows.
# When the cells are run in order, this is the same partition that
# train_model_grad drew when neuNet was fitted.
X_tr, X_te, Y_tr, Y_te = train_test_split(X_train, Y_train, test_size=0.2, random_state=26)
filename = 'SavedModel.sav'
# The former mid-cell `X_tr.head()` was removed: only a cell's last expression is
# displayed, so its result was silently discarded.
load_model(filename,X_te,Y_te)