In [1]:
import datetime
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import tensorflow
tensorflow.keras.__version__
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [2]:
# Read the raw HR export
df = pd.read_csv("HR_Mockdata.csv")

# Parse the date columns as day-first dates; values that cannot be parsed
# are coerced to NaT instead of raising
date_columns = ['hire_date', 'move_date']
for column in date_columns:
    df[column] = pd.to_datetime(df[column], errors='coerce', dayfirst=True)

# Number of whole days elapsed between hire_date and move_date
df['delta'] = (df['move_date'] - df['hire_date']).dt.days

# Remove the raw date columns together with the fields left out of the analysis
df = df.drop(columns=[*date_columns, 'country_home', 'country_host',
                      'end_date', 'fired_quit'])

# Discard columns that are entirely empty, then any row that still has a
# missing value (this includes rows whose delta could not be computed)
df = df.dropna(axis='columns', how='all').dropna()
df.tail()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,Work_accident,promotion_last_5years,salary,leader_host,culture_home,culture_host,function_host,business Size_host,economic Perspective_host,buzz_bust,delta
14994,0.5,0.57,2,151,0,0,low,key talent,egalitarian,egalitarian,finance,united states,positive,buzz,1687.0
14995,0.37,0.48,2,160,0,0,low,consistent,consensual,egalitarian,commercial,united states,positive,buzz,2466.0
14996,0.37,0.53,2,143,0,0,low,consistent,egalitarian,egalitarian,finance,sbu,positive,buzz,1120.0
14997,0.5,0.96,6,280,0,0,low,key talent,hierarquical,egalitarian,operations,united states,positive,buzz,2335.0
14998,0.37,0.52,2,158,0,0,low,consistent,egalitarian,egalitarian,commercial,united states,positive,buzz,2296.0


In [3]:
# Show the remaining column names as a plain Python list
list(df.columns)

['satisfaction_level',
 'last_evaluation',
 'number_project',
 'average_montly_hours',
 'Work_accident',
 'promotion_last_5years',
 'salary',
 'leader_host',
 'culture_home',
 'culture_host',
 'function_host',
 'business Size_host',
 'economic Perspective_host',
 'buzz_bust',
 'delta']

In [4]:
# Categorical columns that get expanded into dummy (indicator) variables
to_dummify = [
    "salary",
    "leader_host",
    "culture_home",
    "culture_host",
    "economic Perspective_host",
    "business Size_host",
    "function_host",
]

In [5]:
# One-hot encode every categorical column in a single call instead of
# rebuilding the whole frame with concat once per column. Each column listed
# in to_dummify is replaced by "<column>_<level>" indicator columns appended
# after the untouched columns, and drop_first=True omits the first level of
# each column.
df = pd.get_dummies(df, columns=to_dummify, drop_first=True)
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,Work_accident,promotion_last_5years,buzz_bust,delta,salary_low,salary_medium,...,function_host_communication,function_host_compliance,function_host_finance,function_host_human resources,function_host_legal,function_host_market access,function_host_marketing,function_host_medical,function_host_operations,function_host_regulatory affairs
0,0.38,0.53,2,157,0,0,bust,5175.0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0.9,0.86,5,262,0,0,bust,3534.0,0,1,...,0,0,1,0,0,0,0,0,0,0
2,0.11,0.88,7,272,0,0,bust,3098.0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,0.75,0.87,5,223,0,0,bust,725.0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0.37,0.52,2,159,0,0,bust,4599.0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Separate the target column from the feature matrix
y = df["buzz_bust"]
X = df.drop(columns=["buzz_bust"])
print(X.shape, y.shape)
X.head(1)

(13410, 34) (13410,)


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,Work_accident,promotion_last_5years,delta,salary_low,salary_medium,leader_host_key talent,...,function_host_communication,function_host_compliance,function_host_finance,function_host_human resources,function_host_legal,function_host_market access,function_host_marketing,function_host_medical,function_host_operations,function_host_regulatory affairs
0,0.38,0.53,2,157,0,0,5175.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:

# Hold out a test split, stratified on the target so both splits keep the
# same class balance; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

In [8]:
# Create a MinMaxScaler and fit it to the training data only, so the test
# split has no influence on the fitted scaling
X_scaler = MinMaxScaler()
X_scaler.fit(X_train);

  return self.partial_fit(X, y)


In [9]:
# Apply the already-fitted X_scaler to both the training and testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [10]:
# Label-encode the target: fit the encoder on the training labels, then map
# both splits to integer class ids
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# One-hot encode the class ids. The width is pinned to the number of classes
# the encoder learned; without num_classes, to_categorical infers the width
# from the largest id present in each array, so the two splits could end up
# with different shapes if one of them lacked the highest class.
num_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(encoded_y_train, num_classes=num_classes)
y_test_categorical = to_categorical(encoded_y_test, num_classes=num_classes)

In [11]:
# Determine input_dim from the matrix the network is actually trained on.
# Counting df's columns minus one only works while X is exactly "df without
# the target"; reading the width of X_train_scaled stays correct if the
# feature selection ever changes.
input_dim = X_train_scaled.shape[1]

# Create the model: two small ReLU hidden layers followed by a 2-unit softmax
# output (one unit per one-hot encoded class)
model = Sequential()
model.add(Dense(units=6, activation='relu', input_dim=input_dim))
model.add(Dense(units=6, activation='relu'))
model.add(Dense(units=2, activation='softmax'))

Instructions for updating:
Colocations handled automatically by placer.


In [12]:
# Print each layer's output shape and parameter count as a sanity check
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 6)                 210       
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 42        
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 14        
Total params: 266
Trainable params: 266
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Configure training: Adam optimizer, categorical cross-entropy loss on the
# one-hot targets, and accuracy as the reported metric
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [14]:
# Train on the scaled training features and one-hot targets for 100 epochs,
# shuffling the training data; verbose=2 logs one line per epoch
model.fit(
    x=X_train_scaled,
    y=y_train_categorical,
    epochs=100,
    verbose=2,
    shuffle=True,
)

Instructions for updating:
Use tf.cast instead.
Epoch 1/100
 - 1s - loss: 0.5492 - acc: 0.7552
Epoch 2/100
 - 0s - loss: 0.5081 - acc: 0.7916
Epoch 3/100
 - 0s - loss: 0.5015 - acc: 0.7916
Epoch 4/100
 - 0s - loss: 0.4944 - acc: 0.7916
Epoch 5/100
 - 0s - loss: 0.4875 - acc: 0.7915
Epoch 6/100
 - 0s - loss: 0.4794 - acc: 0.7913
Epoch 7/100
 - 1s - loss: 0.4714 - acc: 0.7912
Epoch 8/100
 - 1s - loss: 0.4645 - acc: 0.7915
Epoch 9/100
 - 0s - loss: 0.4585 - acc: 0.7925
Epoch 10/100
 - 0s - loss: 0.4528 - acc: 0.7946
Epoch 11/100
 - 0s - loss: 0.4483 - acc: 0.7966
Epoch 12/100
 - 0s - loss: 0.4432 - acc: 0.8003
Epoch 13/100
 - 0s - loss: 0.4390 - acc: 0.8023
Epoch 14/100
 - 0s - loss: 0.4332 - acc: 0.8076
Epoch 15/100
 - 0s - loss: 0.4287 - acc: 0.8092
Epoch 16/100
 - 0s - loss: 0.4236 - acc: 0.8122
Epoch 17/100
 - 0s - loss: 0.4196 - acc: 0.8110
Epoch 18/100
 - 0s - loss: 0.4159 - acc: 0.8143
Epoch 19/100
 - 0s - loss: 0.4116 - acc: 0.8171
Epoch 20/100
 - 1s - loss: 0.4091 - acc: 0.8150
E

<tensorflow.python.keras.callbacks.History at 0x153be970b70>

In [18]:
# Score the trained network on the held-out test split and report the result
model_loss, model_accuracy = model.evaluate(
    x=X_test_scaled, y=y_test_categorical, verbose=2
)
print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

 - 0s - loss: 0.3628 - acc: 0.8258
Normal Neural Network - Loss: 0.36284984953539284, Accuracy: 0.8258275985717773


In [25]:
# Save the model
# Writing the .h5 file fails when the target folder is missing (e.g. on a
# fresh checkout), so make sure the output directory exists first.
model_path = "Output/ML_NeuralNet_WhyExpat.h5"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)