In [450]:
# Contributed by Jun Yan Chen & Batuhan Kir

#  Python 3 environment 
import matplotlib.pyplot as plot
import seaborn as sb
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# Read the employee dataset into a DataFrame and preview its first rows.
df = pd.read_csv("../input/employee-future-prediction/Employee.csv")
print(df.head())


# Check Missing Data

In [451]:
# Number of missing entries in each column (isna is the alias of isnull).
df.isna().sum()

**No missing values in the dataset.**

# Brief summary of each column in the dataset

1. Education: Education level of the employee.                  
2. Joining Year: The year when the employee joined the company.
3. City: Office location where the information was posted.      
4. PaymentTier: The employee's level of payment                 
5. Age: The age of the employee.                                
6. Gender: Gender of the employee.                              
7. EverBenched: Indicates whether the employee experienced being benched. (Not active on any project)             
8. ExperienceInCurrentDomain: Number of years of the employee's experience in current field. 
9. LeaveOrNot: Whether the employee left the company within 2 years. 

# Attributes vs. LeaveOrNot

In [452]:
# Tally the rows for every (Education, LeaveOrNot) combination.
education_vs_leave = (
    df.groupby(['Education', 'LeaveOrNot'])
    .size()
    .reset_index(name='Count')
)

# Grouped bar chart: Education on the x axis, bars split by LeaveOrNot.
fig, ax = plot.subplots(figsize=(20, 10))
sb.barplot(data=education_vs_leave, x='Education', y='Count', hue='LeaveOrNot', ax=ax)
ax.set_title("Education VS. Leave or Not")
plot.show()

There are more Bachelors in the dataset overall, and in all three categories the larger portion did not leave within 2 years. The amount of samples with a Masters education that chose to leave is higher than for Bachelors and PhD.

In [453]:
# Tally the rows for every (JoiningYear, LeaveOrNot) combination.
joining_vs_leave = (
    df.groupby(['JoiningYear', 'LeaveOrNot'])
    .size()
    .reset_index(name='Count')
)

# Grouped bar chart: JoiningYear on the x axis, bars split by LeaveOrNot.
fig, ax = plot.subplots(figsize=(20, 10))
sb.barplot(data=joining_vs_leave, x='JoiningYear', y='Count', hue='LeaveOrNot', ax=ax)
ax.set_title("Joining Year VS. Leave or Not")
plot.show()

During the years 2012-2017, the majority did not leave within 2 years; the result is reversed in 2018.

In [454]:
# Tally the rows for every (City, LeaveOrNot) combination.
city_vs_leave = (
    df.groupby(['City', 'LeaveOrNot'])
    .size()
    .reset_index(name='Count')
)

# Grouped bar chart: City on the x axis, bars split by LeaveOrNot.
fig, ax = plot.subplots(figsize=(20, 10))
sb.barplot(data=city_vs_leave, x='City', y='Count', hue='LeaveOrNot', ax=ax)
ax.set_title("City VS. Leave or Not")
plot.show()

The number of samples from Pune who left within 2 years is slightly higher than the number who stayed.
More samples in Bangalore and New Delhi chose to stay.

In [455]:
# Tally the rows for every (PaymentTier, LeaveOrNot) combination.
tier_vs_leave = (
    df.groupby(['PaymentTier', 'LeaveOrNot'])
    .size()
    .reset_index(name='Count')
)

# Grouped bar chart: PaymentTier on the x axis, bars split by LeaveOrNot.
fig, ax = plot.subplots(figsize=(20, 10))
sb.barplot(data=tier_vs_leave, x='PaymentTier', y='Count', hue='LeaveOrNot', ax=ax)
ax.set_title("Payment Tier VS. Leave or Not")
plot.show()

Samples with PaymentTier 3 have the highest count. The amount of payment is ranked as 1>2>3: a payment tier with a lower value means higher pay.

In [456]:
# Tally the rows for every (Age, LeaveOrNot) combination.
age_vs_leave = (
    df.groupby(['Age', 'LeaveOrNot'])
    .size()
    .reset_index(name='Count')
)

# Grouped bar chart: Age on the x axis, bars split by LeaveOrNot.
fig, ax = plot.subplots(figsize=(20, 10))
sb.barplot(data=age_vs_leave, x='Age', y='Count', hue='LeaveOrNot', ax=ax)
ax.set_title("Age VS. Leave or Not")
plot.show()

Ages range from 22 to 41. A large number of the samples are in the range 24-30. Ages 22 and 23 have the fewest samples overall.

In [457]:
# Tally the rows for every (Gender, LeaveOrNot) combination.
gender_vs_leave = (
    df.groupby(['Gender', 'LeaveOrNot'])
    .size()
    .reset_index(name='Count')
)

# Grouped bar chart: Gender on the x axis, bars split by LeaveOrNot.
fig, ax = plot.subplots(figsize=(20, 10))
sb.barplot(data=gender_vs_leave, x='Gender', y='Count', hue='LeaveOrNot', ax=ax)
ax.set_title("Gender VS. Leave or Not")
plot.show()

There are more males than females in the dataset; males who stayed have a larger count than males who left. Females have fairly close counts between staying and leaving.

In [458]:
# Tally the rows for every (EverBenched, LeaveOrNot) combination.
benched_vs_leave = (
    df.groupby(['EverBenched', 'LeaveOrNot'])
    .size()
    .reset_index(name='Count')
)

# Grouped bar chart: EverBenched on the x axis, bars split by LeaveOrNot.
fig, ax = plot.subplots(figsize=(20, 10))
sb.barplot(data=benched_vs_leave, x='EverBenched', y='Count', hue='LeaveOrNot', ax=ax)
ax.set_title("Benched VS. Leave or Not")
plot.show()

The majority of the people in the data had never been benched, and most of them stayed. People who were benched before have closer counts between staying and leaving.

In [459]:
# Tally the rows for every (ExperienceInCurrentDomain, LeaveOrNot) combination.
experience_vs_leave = (
    df.groupby(['ExperienceInCurrentDomain', 'LeaveOrNot'])
    .size()
    .reset_index(name='Count')
)

# Grouped bar chart: ExperienceInCurrentDomain on the x axis, bars split by LeaveOrNot.
fig, ax = plot.subplots(figsize=(20, 10))
sb.barplot(data=experience_vs_leave, x='ExperienceInCurrentDomain', y='Count', hue='LeaveOrNot', ax=ax)
ax.set_title("Experience VS. Leave or Not")
plot.show()

For every number of years of experience, more people chose to stay; the data has fewer samples with 6 and 7 years of experience.

In [460]:
# Distribution of the target column, shown with readable labels.
# The labels are mapped into a temporary Series instead of being written
# into df and mapped back afterwards, so df is never left in a half-converted
# state if the cell is interrupted or re-run. The Series keeps the name
# 'LeaveOrNot', so the x-axis label is unchanged.
plot.figure(figsize = (10,10))
plot.title('Distribution whether a employee will leave or not')
leave_labels = df['LeaveOrNot'].map({1: 'Leave', 0: 'NotLeave'})
sb.countplot(x = leave_labels)
plot.show()

The count is higher in the data for people who chose to not leave than those who chose to leave.

# Data Normalization

In [461]:
# change categorical data to numerical values
# pd.DataFrame(df) does not take an independent copy of df (for DataFrame
# input the constructor defaults to copy=False), so column assignments on the
# result could leak back into df. assign() returns a new frame, which leaves
# df untouched and makes this cell safe to re-run.
# Each listed column is converted to pandas category codes (integers).
data = df.assign(**{
    column: df[column].astype('category').cat.codes
    for column in ['Education', 'City', 'Gender', 'EverBenched']
})

data.head()

Dataset after all categorical String data are converted to numerical values.
* Education: 0:Bachelors, 1:Masters, 2:PhD
* City: 0:Bangalore, 1:New Delhi, 2:Pune
* Gender: 0:Female, 1:Male
* EverBenched: 0:No, 1:Yes

In [462]:
# Scatter of the encoded Education and City columns, coloured by LeaveOrNot.
sb.scatterplot(data=data, x='Education', y='City', hue='LeaveOrNot')

Scatterplots are not a good visual representation for most of the features in this dataset because most features are categorical, meaning they have a limited range and little variation.

# Classification Models

In [463]:
#classification imports
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

Confusion Matrix:

> True Positive: The classifier predicted NotLeave and the employee actually did NotLeave.
> 
> True Negative: The classifier predicted Leave and the employee actually did Leave.
> 
> False Positive: The classifier predicted NotLeave but the employee actually did Leave.
> 
> False Negative: The classifier predicted Leave but the employee actually did NotLeave.

# Stratified K-Fold split

Stratified K-Fold split separates the dataset into 5 different folds, each containing about 1/5 of the total data. The distribution of the target classes (LeaveOrNot) in each fold is kept close to that of every other fold. For example, there will not be a situation where one fold contains mostly employees who left while another fold contains almost none. The data is not split so that every fold has exactly the same number of samples, but the counts will be close.


In [464]:
# Target vector: the LeaveOrNot column; feature matrix: every other column.
y = data.loc[:, 'LeaveOrNot']
X = data.drop(columns=['LeaveOrNot'])
# Stratified K-Fold Cross Validation  k=5 (shuffled with a fixed seed)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Random Forest

In [465]:
# Random Forest with Stratified K-Fold CV
# The forest is seeded so results are reproducible, and so that
# cross_val_score and cross_val_predict (which each re-fit clones of the
# estimator on the same seeded folds) train identical models: the accuracy
# in the title then describes the same fits as the confusion matrix.
rdForest = RandomForestClassifier(random_state=42)
score = cross_val_score(rdForest, X, y, scoring='accuracy', cv=cv, n_jobs=-1) #n_jobs=number of jobs running parallel
print(score)
print('Accuracy: %.3f (%.3f)' % (score.mean(), score.std()))
# Rounded mean / standard deviation, reused by the results table.
rfmean = round(score.mean(),3)
rfstd = round(score.std(),3)

# Out-of-fold predictions for every sample, used for the confusion matrix.
y_pred = cross_val_predict(rdForest, X, y, cv=cv)
# Random Forest Confusion Matrix
conf_mat = confusion_matrix(y, y_pred)
print('Confusion Matrix: ')
plot.figure(figsize=(9,9))
# fmt="d": the cells are integer sample counts, so no decimals are shown.
sb.heatmap(conf_mat, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r');
plot.ylabel('Actual label');
plot.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(rfmean)
plot.title(all_sample_title, size = 15);

# Stratified K-Fold CV Feature Importances

Feature Importances: Each feature is rated in the range of 0 to 1 based on how much the model depends on that feature. The feature with the highest score is the most important feature for the model to consider when making a prediction. All values add up to 1.

In [466]:
# Stratified K-Fold CV Feature Importances with Random Forest
from sklearn.datasets import make_classification
feature_names = list(X.columns)
# Fit on the real employee data. The previous version trained on a synthetic
# make_classification dataset and labelled the bars with feature_names[-i],
# so the plotted importances had no relation to the actual columns.
for count, (train, _) in enumerate(cv.split(X, y), start=1):
    # train holds the positional row indices of this fold's training part.
    rdForest.fit(X.iloc[train], y.iloc[train])
    importances = rdForest.feature_importances_
    # Feature positions ordered from most to least important.
    importances_index_desc = np.argsort(importances)[::-1]
    feature_labels = [feature_names[i] for i in importances_index_desc]

    # plot
    fig, ax = plot.subplots()
    ax.bar(feature_labels, importances[importances_index_desc])
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_ylabel('Importance')
    ax.set_xlabel('Features')
    ax.set_title('Fold {}'.format(count))
plot.show()

# Regular K-Fold CV Feature Importances

Regular K-Fold is also tested to show the differences. Both the Regular and Stratified K-Fold methods use the Random Forest model. Unlike the Stratified method, the regular K-Fold method splits the data completely at random, meaning it does not take into consideration how balanced the dataset is. Each fold will have a random distribution of data. It is possible for a regular K-Fold split to put all the samples with PhD in fold 1 and no sample with PhD in fold 2 at all. This can cause the problem of overfitting, but using random forest with this method is able to mitigate it to a degree.

In [467]:
# Regular K-Fold CV Feature Importances with Random Forest
cvn = KFold(n_splits=5, random_state=42, shuffle=True)
# Seeded so the run is reproducible and so cross_val_score and
# cross_val_predict train identical forests on the same seeded folds.
rdForest = RandomForestClassifier(random_state=42)
score = cross_val_score(rdForest, X, y, scoring='accuracy', cv=cvn, n_jobs=-1)
print(score)
print('Accuracy: %.3f (%.3f)' % (score.mean(), score.std()))
# Out-of-fold predictions for every sample, used for the confusion matrix.
y_pred = cross_val_predict(rdForest, X, y, cv=cvn)

# Confusion Matrix
conf_mat = confusion_matrix(y, y_pred)
print('Confusion Matrix: ')
plot.figure(figsize=(9,9))
# fmt="d": the cells are integer sample counts, so no decimals are shown.
sb.heatmap(conf_mat, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r');
plot.ylabel('Actual label');
plot.xlabel('Predicted label');
# Rounded to three decimals, as in the other model cells.
all_sample_title = 'Accuracy Score: {0}'.format(round(score.mean(), 3))
plot.title(all_sample_title, size = 15);

feature_names = list(X.columns)
# Fit on the real employee data. The previous version trained on a synthetic
# make_classification dataset and labelled the bars with feature_names[-i],
# so the plotted importances had no relation to the actual columns.
for count, (train, _) in enumerate(cvn.split(X, y), start=1):
    # train holds the positional row indices of this fold's training part.
    rdForest.fit(X.iloc[train], y.iloc[train])
    importances = rdForest.feature_importances_
    # Feature positions ordered from most to least important.
    importances_index_desc = np.argsort(importances)[::-1]
    feature_labels = [feature_names[i] for i in importances_index_desc]

    # plot
    fig, ax = plot.subplots()
    ax.bar(feature_labels, importances[importances_index_desc])
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_ylabel('Importance')
    ax.set_xlabel('Features')
    ax.set_title('Fold {}'.format(count))
plot.show()

As a result, the most important feature for the Stratified K-Fold split will most likely be the same for every fold, because the data was split with the precondition that all folds have similar distributions. For regular K-Fold, the most important feature can sometimes vary between folds.

# Decision Tree

In [468]:
# DTree with Stratified K-Fold CV
# Seeded so the run is reproducible and so cross_val_score and
# cross_val_predict train identical trees on the same seeded folds.
dTree = DecisionTreeClassifier(random_state=42)
score = cross_val_score(dTree, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(score)
print('Accuracy: %.3f (%.3f)' % (score.mean(), score.std()))
# Rounded mean / standard deviation, reused by the results table.
dtmean = round(score.mean(),3)
dtstd = round(score.std(),3)

# Out-of-fold predictions for every sample, used for the confusion matrix.
y_pred = cross_val_predict(dTree, X, y, cv=cv)
#Confusion Matrix
conf_mat = confusion_matrix(y, y_pred)
print('Confusion Matrix: ')
plot.figure(figsize=(9,9))
# fmt="d": the cells are integer sample counts, so no decimals are shown.
sb.heatmap(conf_mat, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r');
plot.ylabel('Actual label');
plot.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(dtmean)
plot.title(all_sample_title, size = 15);

# Gaussian Naive Bayes

In [469]:
# Gaussian Naive Bayes with Stratified K-Fold CV
naiveB = GaussianNB()
score = cross_val_score(naiveB, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(score)
print('Accuracy: %.3f (%.3f)' % (score.mean(), score.std()))
# Rounded mean / standard deviation, reused by the results table.
nbmean = round(score.mean(),3)
nbstd = round(score.std(),3)

# Out-of-fold predictions for every sample, used for the confusion matrix.
y_pred = cross_val_predict(naiveB, X, y, cv=cv)
#Confusion Matrix
conf_mat = confusion_matrix(y, y_pred)
print('Confusion Matrix: ')
plot.figure(figsize=(9,9))
# fmt="d": the cells are integer sample counts, so no decimals are shown.
sb.heatmap(conf_mat, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r');
plot.ylabel('Actual label');
plot.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(nbmean)
plot.title(all_sample_title, size = 15);

Note: Linear regression was not used because we are dealing with categorical data.

# Logistic Regression

In [470]:
# Logistic Regression with Stratified K-Fold CV
logR = LogisticRegression(solver='liblinear')
score = cross_val_score(logR, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print(score)
print('Accuracy: %.3f (%.3f)' % (score.mean(), score.std()))
# Rounded mean / standard deviation, reused by the results table.
lrmean = round(score.mean(),3)
lrstd = round(score.std(),3)

# Out-of-fold predictions for every sample, used for the confusion matrix.
y_pred = cross_val_predict(logR, X, y, cv=cv)
#Confusion Matrix
conf_mat = confusion_matrix(y, y_pred)
print('Confusion Matrix: ')
plot.figure(figsize=(9,9))
# fmt="d": the cells are integer sample counts, so no decimals are shown.
sb.heatmap(conf_mat, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r');
plot.ylabel('Actual label');
plot.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(lrmean)
plot.title(all_sample_title, size = 15);

# Results

In [471]:
# Summary table: one row per model with its mean CV accuracy and its
# standard deviation across folds.
models = pd.DataFrame(
    [
        ('Logistic Regression', lrmean, lrstd),
        ('Decision Tree', dtmean, dtstd),
        ('Random Forest', rfmean, rfstd),
        ('Gaussian Naive Bayes', nbmean, nbstd),
    ],
    columns=['Model', 'Score', 'SD'],
)

# Display the models ordered from highest to lowest score.
models.sort_values('Score', ascending=False)

Models are ranked above based on their accuracy scores

Random Forest has the highest accuracy score followed by Decision Tree.

1. Logistic regression is expected to perform better when the dataset is balanced and the dependent variable is binary (a variable that presents two outcomes, 0 and 1).
2. Gaussian Naive Bayes assumes the data is normally distributed when predicting, and it would perform better if all features in the data were continuous. This contrasts with our dataset, which is strongly unbalanced and where most features are discrete and categorical.
3. Decision Tree performs well on the dataset due to the small number of columns, which means it has a fairly small depth. When a decision tree has more depth, the complexity of the model increases, and a higher complexity can easily result in overfitting when using a decision tree classifier.
4. Random forest is a combination of many decision trees. The prediction results are based on the majority vote among the trained trees. It further reduces the chance of overfitting. When combined with Stratified K-Fold Cross Validation, it does not necessarily increase the accuracy of the model, but the precision will be greatly increased.
