In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

import tensorflow
tensorflow.keras.__version__
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

import pandas as pd 
import datetime

In [54]:
# Load the raw HR extract.
df = pd.read_csv("HR_Mockdata.csv")

# Parse the day-first date strings; values that cannot be parsed become NaT
# (errors='coerce') and are removed by the row-wise dropna further down.
date_columns = ['hire_date', 'move_date']
for date in date_columns:
    df[date] = pd.to_datetime(df[date], errors='coerce', dayfirst=True)

# Tenure between hire and move, expressed as a plain number of days.
df['delta'] = (df['move_date'] - df['hire_date']).dt.days

# Columns that are not part of the analysis (the raw date columns are no
# longer needed once `delta` exists).
unused_columns = ["sales \nfunction", 'country_home', 'country_host', 'end_date',
                  'left', 'fired_quit', '#', 'time_spend_company']

df = (
    df.drop(columns=['move_date', 'hire_date'] + unused_columns)
      .dropna(axis='columns', how='all')   # remove columns that are entirely null
      .dropna()                            # then remove rows with any null left
)
df.tail()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,Work_accident,promotion_last_5years,salary,leader_host,culture_home,culture_host,function_host,business Size_host,economic Perspective_host,buzz_bust,delta
14994,0.1,0.9,6,299,0,0,low,consistent,consensual,egalitarian,operations,united states,positive,buzz,4749.0
14995,0.11,0.77,6,247,0,0,medium,consistent,hierarquical,egalitarian,operations,united states,positive,buzz,8036.0
14996,0.5,0.83,7,302,0,0,medium,key talent,egalitarian,egalitarian,operations,united states,positive,buzz,2058.0
14997,0.9,0.94,5,236,0,0,low,consistent,hierarquical,egalitarian,commercial,united states,positive,buzz,1046.0
14998,0.9,0.86,5,257,0,0,low,consistent,consensual,egalitarian,medical,sbu,positive,buzz,1659.0


In [55]:
# Show the remaining column names as a plain Python list.
df.columns.tolist()

['satisfaction_level',
 'last_evaluation',
 'number_project',
 'average_montly_hours',
 'Work_accident',
 'promotion_last_5years',
 'salary',
 'leader_host',
 'culture_home',
 'culture_host',
 'function_host',
 'business Size_host',
 'economic Perspective_host',
 'buzz_bust',
 'delta']

In [56]:
# Categorical feature columns to expand into dummy (one-hot) variables.
# The order here determines the order of the generated dummy column groups.
to_dummify = [
    "salary",
    "leader_host",
    "culture_home",
    "culture_host",
    "economic Perspective_host",
    "business Size_host",
    "function_host",
]

In [57]:
# One-hot encode every categorical feature in a single call instead of
# dropping/concatenating the frame once per column inside a loop.
# `columns=` encodes only the listed columns, prefixes each dummy with its
# source column name, and appends the dummy groups after the untouched
# columns in the order given by `to_dummify` -- the same layout the
# per-column concat loop produced. `drop_first=True` drops the first level of
# each feature to avoid a redundant (perfectly collinear) column.
df = pd.get_dummies(df, columns=to_dummify, drop_first=True)
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,Work_accident,promotion_last_5years,buzz_bust,delta,salary_low,salary_medium,...,function_host_communication,function_host_compliance,function_host_finance,function_host_human resources,function_host_legal,function_host_market access,function_host_marketing,function_host_medical,function_host_operations,function_host_regulatory affairs
0,0.75,1.0,4,216,0,0,buzz,761.0,1,0,...,0,0,0,0,0,0,1,0,0,0
1,0.89,0.85,5,266,0,0,buzz,761.0,1,0,...,0,0,0,0,0,0,1,0,0,0
2,0.41,0.5,2,128,0,0,buzz,761.0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0.45,0.49,2,144,0,0,buzz,761.0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,0.1,0.82,7,265,0,0,buzz,761.0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [58]:
# Separate the label column from the feature matrix.
target_column = "buzz_bust"
y = df[target_column]
X = df.drop(columns=[target_column])
print(X.shape, y.shape)
X.head(1)

(13410, 34) (13410,)


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,Work_accident,promotion_last_5years,delta,salary_low,salary_medium,leader_host_key talent,...,function_host_communication,function_host_compliance,function_host_finance,function_host_human resources,function_host_legal,function_host_market access,function_host_marketing,function_host_medical,function_host_operations,function_host_regulatory affairs
0,0.75,1.0,4,216,0,0,761.0,1,0,1,...,0,0,0,0,0,0,1,0,0,0


In [59]:

# Hold-out split, stratified on the label so both partitions keep the same
# class proportions; random_state is fixed for a repeatable split and the
# test size is left at scikit-learn's default.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

In [60]:
# Create a MinMaxScaler and fit it to the training data only, so the test
# split is never used when computing the scaling parameters
X_scaler = MinMaxScaler().fit(X_train)

  return self.partial_fit(X, y)


In [61]:
# Apply the scaler fitted on the training data to both partitions.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [62]:
# Label-encode the target: fit the encoder on the training labels only, then
# map both partitions to integer class ids.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# One-hot encode the integer labels. `num_classes` is pinned to the number of
# classes the encoder learned; otherwise to_categorical infers the width from
# the largest label present in each array, so a partition missing the highest
# class would get fewer columns than the model's output layer expects.
num_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(encoded_y_train, num_classes=num_classes)
y_test_categorical = to_categorical(encoded_y_test, num_classes=num_classes)

In [77]:
# Determine input_dim from the array the network is actually trained on.
# Counting df's columns minus one only works while df still holds exactly the
# features plus the single label column; reading the width of the scaled
# training matrix stays correct regardless of what later happens to df.
input_dim = X_train_scaled.shape[1]

# Create the model: two small ReLU hidden layers followed by a 2-unit softmax
# output, matching the two-column one-hot targets.
model = Sequential()
model.add(Dense(units=6, activation='relu', input_dim=input_dim))
model.add(Dense(units=6, activation='relu'))
model.add(Dense(units=2, activation='softmax'))

In [78]:
# Print the layer-by-layer architecture and parameter counts as a sanity check
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 6)                 210       
_________________________________________________________________
dense_13 (Dense)             (None, 6)                 42        
_________________________________________________________________
dense_14 (Dense)             (None, 2)                 14        
Total params: 266
Trainable params: 266
Non-trainable params: 0
_________________________________________________________________


In [79]:
# Configure training: categorical cross-entropy pairs with the one-hot targets
# and softmax output; accuracy is tracked as the reported metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [80]:
# Train on the scaled training features for 100 epochs, reshuffling the data
# each epoch; verbose=2 prints one summary line per epoch.
model.fit(
    x=X_train_scaled,
    y=y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2,
)

Epoch 1/100
 - 1s - loss: 0.7024 - acc: 0.6511
Epoch 2/100
 - 0s - loss: 0.5179 - acc: 0.7916
Epoch 3/100
 - 0s - loss: 0.5107 - acc: 0.7916
Epoch 4/100
 - 0s - loss: 0.5020 - acc: 0.7916
Epoch 5/100
 - 0s - loss: 0.4943 - acc: 0.7917
Epoch 6/100
 - 0s - loss: 0.4873 - acc: 0.7921
Epoch 7/100
 - 1s - loss: 0.4803 - acc: 0.7941
Epoch 8/100
 - 0s - loss: 0.4733 - acc: 0.7927
Epoch 9/100
 - 0s - loss: 0.4650 - acc: 0.7932
Epoch 10/100
 - 0s - loss: 0.4554 - acc: 0.7943
Epoch 11/100
 - 0s - loss: 0.4473 - acc: 0.8009
Epoch 12/100
 - 0s - loss: 0.4413 - acc: 0.7998
Epoch 13/100
 - 0s - loss: 0.4358 - acc: 0.8030
Epoch 14/100
 - 0s - loss: 0.4307 - acc: 0.8061
Epoch 15/100
 - 0s - loss: 0.4262 - acc: 0.8063
Epoch 16/100
 - 0s - loss: 0.4216 - acc: 0.8077
Epoch 17/100
 - 0s - loss: 0.4180 - acc: 0.8103
Epoch 18/100
 - 0s - loss: 0.4151 - acc: 0.8060
Epoch 19/100
 - 0s - loss: 0.4118 - acc: 0.8095
Epoch 20/100
 - 1s - loss: 0.4091 - acc: 0.8085
Epoch 21/100
 - 0s - loss: 0.4079 - acc: 0.8086
E

<tensorflow.python.keras.callbacks.History at 0x18b19203ef0>

In [81]:
# Score the trained network on the held-out test partition; evaluate returns
# the loss followed by the compiled metric (accuracy).
test_metrics = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)
model_loss, model_accuracy = test_metrics
print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

 - 0s - loss: 0.3610 - acc: 0.8392
Normal Neural Network - Loss: 0.36097921294629876, Accuracy: 0.8392484188079834
