# 207 Final Project
# Predicting Employee Attrition Using Decision Tree, KNN, and Logistic Regression

Class: 207 Applied Machine Learning | Spring 2023 <br>
Team: Ivy Chan, John Gibbons, Mark Herrera, Maria Manna 

# Import packages & mount drive


In [None]:
# run the below pip install if there's a problem importing heatmap
#!pip install mlxtend --upgrade

In [None]:
# standard libraries
import pandas as pd
import numpy as np
import os
import random

# visualizations
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import Image
import mlxtend
from mlxtend.plotting import scatterplotmatrix
#from mlxtend.plotting import heatmap
from mlxtend.plotting import plot_decision_regions
import seaborn as sns

# scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss

# tf
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import initializers

# display options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


random.seed(2)

### Mount drive

In [None]:
#!pip install -U -q PyDrive

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount = True)


Mounted at /content/drive


# Read data

In [None]:
df_original = pd.read_csv("/content/drive/MyDrive/207-MachineLearning/207_final_project/WA_Fn-UseC_-HR-Employee-Attrition.csv")

In [None]:
df = df_original.copy() # keep the original df

In [None]:
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager'], dtype='object')

In [None]:
df.shape
print("Shape of dataset: ", df.shape)


Shape of dataset:  (1470, 35)


# Pre-Processing

### Recode boolean variables

In [None]:
# Encode the Attrition label as an integer flag: 1 where the value is 'Yes', 0 otherwise
df['Attrition'] = df['Attrition'].eq('Yes').astype(int)

In [None]:
# Encode the OverTime column as an integer flag: 1 where the value is 'Yes', 0 otherwise
df['OverTime'] = df['OverTime'].eq('Yes').astype(int)

### Recode ordinal variables

In [None]:
# BusinessTravel is ordinal, so encode it by travel intensity:
# Non-Travel -> 0, Travel_Rarely -> 1, Travel_Frequently -> 2
travel_mapping = dict(zip(['Non-Travel', 'Travel_Rarely', 'Travel_Frequently'], range(3)))
df['BusinessTravel'] = df['BusinessTravel'].map(travel_mapping)

### Add one-hot encodings for nominal variables

In [None]:
# One-hot encode the nominal variables (Gender, Department, EducationField, JobRole).
# Each indicator column is named <lowercased feature name><category with spaces removed>.
nominal_features = ['Gender', 'Department', 'EducationField', 'JobRole']

# Gather the original frame plus one dummy frame per nominal feature, then join them once
frames_to_join = [df]
for feature in nominal_features:
    feature_dummies = pd.get_dummies(df[feature])
    feature_dummies.columns = feature_dummies.columns.str.replace(' ', '')
    feature_dummies = feature_dummies.add_prefix(feature.lower())
    frames_to_join.append(feature_dummies)
df = pd.concat(frames_to_join, axis = 1)

print(df.shape)
print(df.columns.sort_values())

(1470, 55)
Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'departmentHumanResources', 'departmentResearch&Development', 'departmentSales', 'educationfieldHumanResources', 'educationfieldLifeSciences', 'educationfieldMarketing', 'educationfieldMedical', 'educationfieldOther', 'educationfieldTechnicalDegree', 'genderFemale', 'genderMale', 'jobroleHealthcareRepresentative', 'jobroleHumanResources', 'jobroleLaboratoryTechn

### Create subsets of features of interest

In [None]:
# Core (non one-hot) features; kept as its own list for more readable EDA
features_subset = [
    'Age', 'BusinessTravel', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction',
    'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'NumCompaniesWorked', 'OverTime',
    'PercentSalaryHike', 'PerformanceRating', 'StockOptionLevel', 'TotalWorkingYears',
    'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]

# All features of interest: the core features followed by the one-hot indicator columns
all_features = features_subset + [
    'departmentHumanResources', 'departmentResearch&Development', 'departmentSales',
    'educationfieldHumanResources', 'educationfieldLifeSciences', 'educationfieldMarketing',
    'educationfieldMedical', 'educationfieldOther', 'educationfieldTechnicalDegree',
    'genderFemale', 'genderMale',
    'jobroleHealthcareRepresentative', 'jobroleHumanResources', 'jobroleLaboratoryTechnician',
    'jobroleManager', 'jobroleManufacturingDirector', 'jobroleResearchDirector',
    'jobroleResearchScientist', 'jobroleSalesExecutive', 'jobroleSalesRepresentative',
]

In [None]:
print(len(all_features))
print(len(features_subset))

39
19


### Train / Test Split

In [None]:
# Define feature and outcome arrays
X = df[all_features].values
y = df.Attrition.values
print("Features shape: ", X.shape)
print("Labels shape: ", y.shape)

Features shape:  (1470, 39)
Labels shape:  (1470,)


In [None]:
# set seed for consistent behavior
np.random.seed(1234)

In [None]:
# Create test/train split arrays
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=1, stratify=y)

In [None]:
# print sizes
print('Size of X_train', X_train.shape)
print('Size of y_train', y_train.shape)
print('Size of X_test', X_test.shape)
print('Size of y_test', y_test.shape)


Size of X_train (1029, 39)
Size of y_train (1029,)
Size of X_test (441, 39)
Size of y_test (441,)


### Print summary statistics before standardizing

In [None]:
# Column names for the summary frame: the label first, then every feature
features_summary = ['Attrition', *all_features]

# Put the training labels and the (unstandardized) training features side by side
labels_frame = pd.DataFrame(y_train)
features_frame = pd.DataFrame(X_train)
df_train_summary = pd.concat([labels_frame, features_frame], axis=1)

# Assign column names
df_train_summary.columns = features_summary

# Keep only a handful of columns so the summary output stays digestible
summary_columns = [
    'Attrition', 'Age', 'JobSatisfaction', 'MonthlyIncome', 'NumCompaniesWorked',
    'PercentSalaryHike', 'PerformanceRating', 'WorkLifeBalance', 'YearsSinceLastPromotion',
]
df_train_summary = df_train_summary[summary_columns]

df_train_summary.describe().round(3)

Unnamed: 0,Attrition,Age,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,WorkLifeBalance,YearsSinceLastPromotion
count,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0
mean,0.161,36.985,2.756,6517.126,2.723,15.231,3.158,2.737,2.188
std,0.368,9.194,1.095,4658.337,2.532,3.668,0.365,0.71,3.215
min,0.0,18.0,1.0,1052.0,0.0,11.0,3.0,1.0,0.0
25%,0.0,30.0,2.0,2936.0,1.0,12.0,3.0,2.0,0.0
50%,0.0,36.0,3.0,4969.0,2.0,14.0,3.0,3.0,1.0
75%,0.0,43.0,4.0,8381.0,4.0,18.0,3.0,3.0,3.0
max,1.0,60.0,4.0,19999.0,9.0,25.0,4.0,4.0,15.0


### Standardize features

In [None]:
# Standardize features.
# The scaler is fitted once, on the training data only, and the same fitted
# parameters are then applied to the test data.
sc_x = StandardScaler()
X_train_std = sc_x.fit_transform(X_train)
X_test_std = sc_x.transform(X_test)

# EDA

### Create EDA dataframe

In [None]:
# Create train df for EDA: training labels in the first column, followed by
# the standardized training features
df_train_eda = pd.concat(
    [pd.DataFrame(y_train), pd.DataFrame(X_train_std)],
    axis=1
)

# Column names: 'Attrition' first, then the features in all_features order
features_eda = all_features.copy()
features_eda.insert(0, 'Attrition')

# Rename columns
df_train_eda.columns = features_eda

df_train_eda.head()

Unnamed: 0,Attrition,Age,BusinessTravel,DistanceFromHome,Education,EnvironmentSatisfaction,JobLevel,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,departmentHumanResources,departmentResearch&Development,departmentSales,educationfieldHumanResources,educationfieldLifeSciences,educationfieldMarketing,educationfieldMedical,educationfieldOther,educationfieldTechnicalDegree,genderFemale,genderMale,jobroleHealthcareRepresentative,jobroleHumanResources,jobroleLaboratoryTechnician,jobroleManager,jobroleManufacturingDirector,jobroleResearchDirector,jobroleResearchScientist,jobroleSalesExecutive,jobroleSalesRepresentative
0,0,-0.977798,-0.147612,2.394813,-1.906415,0.257606,-0.064735,-0.691045,0.006846,-0.680858,-0.599427,-0.335848,-0.433845,1.355373,-0.425315,-1.03861,0.175426,0.502511,-0.369541,0.814259,-0.211353,0.71175,-0.64452,-0.137156,1.201741,-0.340664,-0.679447,-0.261793,-0.311472,-0.815505,0.815505,-0.333513,-0.193128,-0.466638,-0.276333,2.998381,-0.228358,-0.495745,-0.525999,-0.230704
1,0,-0.433696,-0.147612,-0.151915,1.090073,-1.539526,-0.064735,-1.605036,-0.026874,-0.285707,-0.599427,-0.608609,-0.433845,0.203698,-0.425315,-2.448543,-0.814591,-0.619754,-0.058364,-0.592984,-0.211353,-1.404987,1.551543,-0.137156,1.201741,-0.340664,-0.679447,-0.261793,-0.311472,-0.815505,0.815505,-0.333513,-0.193128,-0.466638,-0.276333,-0.333513,-0.228358,-0.495745,1.901144,-0.230704
2,0,0.436868,-0.147612,-1.000824,0.091244,1.156173,2.672746,0.222946,2.72158,-0.680858,-0.599427,-0.881369,-0.433845,0.203698,1.363615,0.371323,2.485464,0.783077,-0.058364,1.658605,4.731423,-1.404987,-0.64452,7.290946,-0.832126,-0.340664,-0.679447,-0.261793,-0.311472,-0.815505,0.815505,-0.333513,-0.193128,-0.466638,3.618824,-0.333513,-0.228358,-0.495745,-0.525999,-0.230704
3,0,-0.324875,-0.147612,-0.151915,-0.907586,-0.64096,-0.064735,0.222946,0.060539,-0.680858,-0.599427,1.573476,2.304969,1.355373,-0.169753,0.371323,0.505431,1.063643,0.563988,1.095708,-0.211353,-1.404987,1.551543,-0.137156,-0.832126,-0.340664,-0.679447,-0.261793,3.21056,-0.815505,0.815505,-0.333513,-0.193128,-0.466638,-0.276333,-0.333513,-0.228358,-0.495745,1.901144,-0.230704
4,0,-0.651337,-2.022835,-0.758279,-0.907586,0.257606,-0.977228,-1.605036,-0.741424,-0.680858,1.668259,-0.608609,-0.433845,0.203698,-0.936437,1.781257,-0.484585,-0.619754,-0.058364,-0.311535,-0.211353,0.71175,-0.64452,-0.137156,-0.832126,-0.340664,1.471786,-0.261793,-0.311472,-0.815505,0.815505,-0.333513,-0.193128,-0.466638,-0.276333,-0.333513,-0.228358,2.017168,-0.525999,-0.230704


### Print summary statistics (standardized)

In [None]:
df_train_eda.describe().round(3)

Unnamed: 0,Attrition,Age,BusinessTravel,DistanceFromHome,Education,EnvironmentSatisfaction,JobLevel,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,departmentHumanResources,departmentResearch&Development,departmentSales,educationfieldHumanResources,educationfieldLifeSciences,educationfieldMarketing,educationfieldMedical,educationfieldOther,educationfieldTechnicalDegree,genderFemale,genderMale,jobroleHealthcareRepresentative,jobroleHumanResources,jobroleLaboratoryTechnician,jobroleManager,jobroleManufacturingDirector,jobroleResearchDirector,jobroleResearchScientist,jobroleSalesExecutive,jobroleSalesRepresentative
count,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0,1029.0
mean,0.161,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,0.0
std,0.368,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,0.0,-2.066,-2.023,-1.001,-1.906,-1.54,-0.977,-1.605,-1.174,-1.076,-0.599,-1.154,-0.434,-0.948,-1.448,-2.449,-1.145,-1.181,-0.681,-1.156,-0.211,-1.405,-0.645,-0.137,-0.832,-0.341,-0.679,-0.262,-0.311,-0.816,-1.226,-0.334,-0.193,-0.467,-0.276,-0.334,-0.228,-0.496,-0.526,-0.231
25%,0.0,-0.76,-0.148,-0.88,-0.908,-0.641,-0.977,-0.691,-0.769,-0.681,-0.599,-0.881,-0.434,-0.948,-0.681,-1.039,-0.65,-0.62,-0.681,-0.593,-0.211,-1.405,-0.645,-0.137,-0.832,-0.341,-0.679,-0.262,-0.311,-0.816,-1.226,-0.334,-0.193,-0.467,-0.276,-0.334,-0.228,-0.496,-0.526,-0.231
50%,0.0,-0.107,-0.148,-0.273,0.091,0.258,-0.065,0.223,-0.332,-0.286,-0.599,-0.336,-0.434,0.204,-0.17,0.371,-0.32,-0.339,-0.37,-0.312,-0.211,0.712,-0.645,-0.137,-0.832,-0.341,-0.679,-0.262,-0.311,-0.816,0.816,-0.334,-0.193,-0.467,-0.276,-0.334,-0.228,-0.496,-0.526,-0.231
75%,0.0,0.655,-0.148,0.697,1.09,1.156,0.848,1.137,0.4,0.505,1.668,0.755,-0.434,0.204,0.469,0.371,0.34,0.783,0.253,0.814,-0.211,0.712,1.552,-0.137,1.202,-0.341,1.472,-0.262,-0.311,1.226,0.816,-0.334,-0.193,-0.467,-0.276,-0.334,-0.228,-0.496,-0.526,-0.231
max,1.0,2.504,1.728,2.395,2.089,1.156,2.673,1.137,2.896,2.48,1.668,2.665,2.305,2.507,3.664,1.781,5.456,3.869,3.987,3.629,4.731,0.712,1.552,7.291,1.202,2.935,1.472,3.82,3.211,1.226,0.816,2.998,5.178,2.143,3.619,2.998,4.379,2.017,1.901,4.335


### Data profiling

pandas_profiling is an excellent tool for visualizing each feature. It generates an HTML file called eda_profile.html; please open it and review the details. According to the pandas profile, these four columns have no predictive power:
- EmployeeCount, all values are 1
- EmployeeNumber, should not influence the outcome
- Over18, all values are 1
- StandardHours, all values are 80

Let's consider dropping them.

In [None]:
#!pip install pandas_profiling==3.6.3

In [None]:
# Note that the following EDA profile takes a few minutes to generate
from pandas_profiling import ProfileReport

def generate_pandas_profile_eda():
    """Build a profile report of the module-level `df` and save it as an HTML file for EDA.

    The report is written to eda_profile.html in the project folder on Google Drive;
    download that file locally to view it. Nothing is returned.
    """
    report_path = "/content/drive/MyDrive/207-MachineLearning/207_final_project/eda_profile.html"
    html_options = {"style": {"full_width": True}}

    # Build the report and write it straight to the HTML file.
    ProfileReport(
        df,
        minimal=True,
        title="Employee Attrition Data Report",
        html=html_options,
    ).to_file(output_file=report_path)

# uncomment to generate pandas profile HTML
# generate_pandas_profile_eda()

### Explore categorical variable distribution

In [None]:
# Bar charts of employee counts for each nominal variable.
# NOTE: these categorical columns are not part of df_train_eda, so the counts come
# from the whole dataset in `df` (not only the training split). This cell can be
# dropped if it does not add value.
categorical_features = ['Department', 'EducationField', 'JobRole']
for f in categorical_features:
    # value_counts() returns a Series (category -> count) that can be plotted directly
    counts = df[f].value_counts()
    counts.plot(kind='bar')
    plt.title(f'Employee Counts by {f}')
    plt.show()
    print('-'*45)

---------------------------------------------
---------------------------------------------
---------------------------------------------


### Heatmap

In [None]:
# Columns whose pairwise correlations (including with Attrition) are shown in the heatmap
heatmap_columns = [
    'Attrition', 'genderFemale', 'genderMale', 'JobSatisfaction',
    'MonthlyIncome', 'NumCompaniesWorked', 'PerformanceRating', 'StockOptionLevel',
    'WorkLifeBalance', 'YearsSinceLastPromotion',
]
cm = df_train_eda[heatmap_columns].corr()

fig, ax = plt.subplots(figsize=(12, 8))

# Annotated heatmap of the correlation matrix, with every tick label shown
heatmap_options = dict(
    cmap='Blues',
    xticklabels=True,
    yticklabels=True,
    annot=True,
    annot_kws={"fontsize":14},
    cbar=True,
)
sns.heatmap(cm, **heatmap_options)
plt.title('Attrition Correlation Heatmap', fontsize=14)
plt.show()

### Calculate baseline log loss & attrition rates from df_train_eda

In [None]:
# Baseline on the training data: a majority-class predictor that labels
# every employee as "no attrition" (0).
n_train_rows = df_train_eda.shape[0]
y_pred_baseline = np.zeros(n_train_rows, dtype=int)  # 0 is no attrition

# NOTE(review): the predictions are hard 0 labels rather than probabilities, so the
# reported value presumably depends on how log_loss clips them -- confirm.
baseline_log_loss = log_loss(df_train_eda.Attrition, y_pred_baseline)
print(f"Baseline log loss: {baseline_log_loss}")


Baseline log loss: 5.571853140481218


In [None]:
# Class counts of the Attrition label in the training data
print(df_train_eda['Attrition'].value_counts(),'\n')

# Share of training rows with Attrition == 1, expressed as a percentage
attrition_count = int((df_train_eda['Attrition'] == 1).sum())
baseline_attrition_rate = attrition_count / len(df_train_eda) * 100
print("Baseline attrition rate: {:.1f}%.".format(baseline_attrition_rate))

0    863
1    166
Name: Attrition, dtype: int64 

Baseline attrition rate: 16.1%.


# Decision Tree

In [None]:
def information_gain(data, labels, feature, threshold=0):
    ''' Computes the information gain obtained by splitting the data on one feature
    # param data: an array of featurized examples (one row per example)
    # param labels: an array of labels, aligned row-by-row with data
    # param feature: index of the feature used to split the data
    # param threshold: split point; rows whose feature value is greater than it go to one side,
    #                  all remaining rows to the other (the default threshold is good for binary features)
    # return: entropy of the labels before the split minus the size-weighted entropy after it
    '''
    # Entropy of the label distribution before splitting.
    parent_entropy = entropy(get_parent_distribution(labels))

    # partitions[False] collects the labels of rows that are not above the threshold,
    # partitions[True] collects the labels of rows that are above it.
    partitions = ([], [])
    for row, row_label in zip(data, labels):
        partitions[bool(row[feature] > threshold)].append(row_label)
    not_above, above = partitions

    # Entropy of each side of the split.
    not_above_entropy = entropy(get_parent_distribution(not_above))
    above_entropy = entropy(get_parent_distribution(above))

    # Weight each side's entropy by the fraction of examples that landed on it.
    n_labels = len(labels)
    split_entropy = (
        (1.0 * len(not_above) / n_labels) * not_above_entropy
        + (1.0 * len(above) / n_labels) * above_entropy
    )

    # Information gain is the reduction in entropy achieved by the split.
    return parent_entropy - split_entropy

In [None]:
def entropy(distribution):
    ''' Computes the entropy (in bits) of a probability distribution
    # param distribution: iterable of probabilities
    # return: the accumulated sum of -p * log2(p) over the distribution (0.0 when it is empty)
    '''
    total = 0.0
    for p in distribution:
        # log(0) = -inf, so any p that is not strictly positive uses -100 as a stand-in
        log_p = np.log2(p) if p > 0.0 else -100.0
        total -= p * log_p

    return total

In [None]:
def get_parent_distribution(labels):
    ''' Computes the empirical distribution of the class labels
    # param labels: a list or array of class labels (may be empty)
    # return: numpy array with the relative frequency of each distinct label,
    #         ordered by sorted label value; an empty array when labels is empty
    '''
    # Count the occurrences of each distinct label. Counting by value (rather than
    # using the label as an array index) also works when a label is missing,
    # e.g. a subset that contains only 1s.
    _, label_counts = np.unique(labels, return_counts=True)

    # No labels means no distribution; return early to avoid dividing by zero.
    if label_counts.size == 0:
        return np.array([], dtype=float)

    # Normalize to get a distribution.
    return label_counts / label_counts.sum()

In [None]:
# Binarize the features in features_subset: 1 if the value is at or above the
# column mean, 0 otherwise. Columns outside features_subset are left unchanged.
binarize_threshold = []

# NOTE: this rebinds `df` to a copy of the training EDA frame
df = df_train_eda.copy()
df_b = df_train_eda.copy()

for feature in features_subset:
    # Record the threshold BEFORE overwriting the column; taking the mean afterwards
    # would give the fraction of ones instead of the threshold that was applied.
    threshold = df_b[feature].mean()
    binarize_threshold.append(threshold)
    df_b[feature] = np.where(df_b[feature] >= threshold, 1, 0)
print('Binarize thresholds:\n', binarize_threshold)
df_b.head()

Binarize thresholds:
 [0.4557823129251701, 0.184645286686103, 0.35471331389698735, 0.6977648202137998, 0.6122448979591837, 0.26822157434402333, 0.6248785228377065, 0.337220602526725, 0.4159378036929057, 0.26433430515063167, 0.3741496598639456, 0.15840621963070942, 0.5840621963070942, 0.3401360544217687, 0.7026239067055393, 0.41690962099125367, 0.37026239067055394, 0.25072886297376096, 0.3663751214771623]


Unnamed: 0,Attrition,Age,BusinessTravel,DistanceFromHome,Education,EnvironmentSatisfaction,JobLevel,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,departmentHumanResources,departmentResearch&Development,departmentSales,educationfieldHumanResources,educationfieldLifeSciences,educationfieldMarketing,educationfieldMedical,educationfieldOther,educationfieldTechnicalDegree,genderFemale,genderMale,jobroleHealthcareRepresentative,jobroleHumanResources,jobroleLaboratoryTechnician,jobroleManager,jobroleManufacturingDirector,jobroleResearchDirector,jobroleResearchScientist,jobroleSalesExecutive,jobroleSalesRepresentative
0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,1,1,0,1,-0.211353,0.71175,-0.64452,-0.137156,1.201741,-0.340664,-0.679447,-0.261793,-0.311472,-0.815505,0.815505,-0.333513,-0.193128,-0.466638,-0.276333,2.998381,-0.228358,-0.495745,-0.525999,-0.230704
1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,-0.211353,-1.404987,1.551543,-0.137156,1.201741,-0.340664,-0.679447,-0.261793,-0.311472,-0.815505,0.815505,-0.333513,-0.193128,-0.466638,-0.276333,-0.333513,-0.228358,-0.495745,1.901144,-0.230704
2,0,1,0,0,1,1,1,1,1,0,0,0,0,1,1,1,1,1,0,1,4.731423,-1.404987,-0.64452,7.290946,-0.832126,-0.340664,-0.679447,-0.261793,-0.311472,-0.815505,0.815505,-0.333513,-0.193128,-0.466638,3.618824,-0.333513,-0.228358,-0.495745,-0.525999,-0.230704
3,0,0,0,0,0,0,0,1,1,0,0,1,1,1,0,1,1,1,1,1,-0.211353,-1.404987,1.551543,-0.137156,-0.832126,-0.340664,-0.679447,-0.261793,3.21056,-0.815505,0.815505,-0.333513,-0.193128,-0.466638,-0.276333,-0.333513,-0.228358,-0.495745,1.901144,-0.230704
4,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,-0.211353,0.71175,-0.64452,-0.137156,-0.832126,-0.340664,1.471786,-0.261793,-0.311472,-0.815505,0.815505,-0.333513,-0.193128,-0.466638,-0.276333,-0.333513,-0.228358,2.017168,-0.525999,-0.230704


In [None]:
len(features_subset)

19

In [None]:
features_subset

['Age',
 'BusinessTravel',
 'DistanceFromHome',
 'Education',
 'EnvironmentSatisfaction',
 'JobLevel',
 'JobSatisfaction',
 'MonthlyIncome',
 'NumCompaniesWorked',
 'OverTime',
 'PercentSalaryHike',
 'PerformanceRating',
 'StockOptionLevel',
 'TotalWorkingYears',
 'WorkLifeBalance',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager']

In [None]:
# Split the binarized frame into features and labels: column 0 of df_b is
# 'Attrition' (the label) and every remaining column is a feature.
X_b = np.array(df_b.iloc[:, 1:])
y_b = np.array(df_b.iloc[:, 0])

In [None]:
# Print the information gain of each binarized feature as: column index, gain, name.
# The position of a name in features_subset is used as its column index in X_b.
for feature, feature_name in enumerate(features_subset):
    IG = information_gain(X_b, y_b, feature)
    print ('%d %.3f %s' %(feature, IG, feature_name))

0 0.011 Age
1 0.008 BusinessTravel
2 0.004 DistanceFromHome
3 0.000 Education
4 0.006 EnvironmentSatisfaction
5 0.006 JobLevel
6 0.005 JobSatisfaction
7 0.009 MonthlyIncome
8 0.001 NumCompaniesWorked
9 0.028 OverTime
10 0.000 PercentSalaryHike
11 0.000 PerformanceRating
12 0.033 StockOptionLevel
13 0.012 TotalWorkingYears
14 0.006 WorkLifeBalance
15 0.013 YearsAtCompany
16 0.014 YearsInCurrentRole
17 0.003 YearsSinceLastPromotion
18 0.010 YearsWithCurrManager
