In [1]:
# Import modules
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split

In [2]:
#selecting the columns to use
columns_keep =['Age','Attrition',
 'OverTime',
 'MonthlyIncome',
 'NumCompaniesWorked',
 'DailyRate',
 'DistanceFromHome',
 'HourlyRate',
 'MonthlyRate',
 'PercentSalaryHike',
 'TotalWorkingYears',
 'YearsInCurrentRole',
 'YearsAtCompany',
 'EnvironmentSatisfaction',
 'JobSatisfaction',
 'RelationshipSatisfaction',
 'StockOptionLevel',
 'TrainingTimesLastYear',
 'WorkLifeBalance',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager',
 'Education', 
 'JobInvolvement',
 'JobLevel',
 'BusinessTravel_Travel_Frequently']

In [3]:
# Reading the data for the model
raw_data = pd.read_csv('../Resources/Data/Employee-Attrition.csv')

# Remove uncecessary columns and code binary ones Yes = 1 | No = 0
raw_data["Attrition"] = raw_data["Attrition"].eq('Yes').mul(1)
raw_data["OverTime"] = raw_data["OverTime"].eq('Yes').mul(1)
raw_data["Gender"] = raw_data["Gender"].eq('Female').mul(1)

# One-hot encode the data using pandas get_dummies
raw_data = pd.get_dummies(raw_data)




In [4]:
raw_data

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,...,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y
0,41,1,1102,1,2,1,1,2,1,94,...,0,0,0,0,1,0,0,0,1,1
1,49,0,279,8,1,1,2,3,0,61,...,0,0,0,1,0,0,0,1,0,1
2,37,1,1373,2,2,1,4,4,0,92,...,0,0,0,0,0,0,0,0,1,1
3,33,0,1392,3,4,1,5,4,1,56,...,0,0,0,1,0,0,0,1,0,1
4,27,0,591,2,1,1,7,1,0,40,...,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,0,884,23,2,1,2061,3,0,41,...,0,0,0,0,0,0,0,1,0,1
1466,39,0,613,6,1,1,2062,4,0,42,...,0,0,0,0,0,0,0,1,0,1
1467,27,0,155,4,3,1,2064,2,0,87,...,0,1,0,0,0,0,0,1,0,1
1468,49,0,1023,2,3,1,2065,4,0,63,...,0,0,0,0,1,0,0,1,0,1


raw_data.to_csv("raw_data.csv")

In [5]:
# Split tha data
X = raw_data[columns_keep].drop(["Attrition"],axis=1)
X = X.values


y = raw_data['Attrition']
# y = y.astype(int)
y

print(X.shape)
print(y.shape)

(1470, 24)
(1470,)


In [6]:
#creating training and testing data
# first one
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, test_size=0.2, random_state=15)

# Second one
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, train_size = 0.9, test_size=0.1, random_state=15)

print(X_test.shape)


print(y_train.shape)
print(y_test.shape)


(294, 24)
(1058,)
(294,)


In [7]:
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler().fit(X_train)

In [8]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
X_valid_scaled = X_scaler.transform(X_valid)


In [9]:
# X_train_scaled

In [10]:
from tensorflow.keras.utils import to_categorical

In [11]:
# # One-hot encoding
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)
y_valid_categorical = to_categorical(y_valid)

y_train_categorical

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [12]:
# creating a sequential model
from tensorflow.keras.models import Sequential

model = Sequential()

In [13]:

from tensorflow.keras.layers import Dense
number_inputs = 24
number_hidden_nodes = 100

#creating three layers
model.add(Dense(units=number_hidden_nodes,
                activation='relu', input_dim=number_inputs))
model.add(Dense(units=number_hidden_nodes,
                activation='relu'))              
model.add(Dense(units=number_hidden_nodes,
                activation='relu'))

In [14]:
# final layer 
number_classes = 2
model.add(Dense(units=number_classes, activation='softmax'))

In [15]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               2500      
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 202       
Total params: 22,902
Trainable params: 22,902
Non-trainable params: 0
_________________________________________________________________


In [16]:
## Compile the Model
# Use categorical crossentropy for categorical data and mean squared error for regression

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [17]:
from keras.callbacks import EarlyStopping

#EarlyStopping function that tracks the val_loss value, stops the training if there are no changes towards val_loss after 3 epochs, and keeps the best weights once the training stops
earlystop = EarlyStopping(monitor = 'val_loss',
                          min_delta = 0,
                          patience = 3,
                          verbose = 1,
                          restore_best_weights = True)

# Fit (train) the model
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=1000,
    shuffle=True,
    verbose=2,
    callbacks=earlystop,
    validation_data=(X_valid_scaled, y_valid_categorical)
)

# model.save("NN_01.hi")


Epoch 1/1000
34/34 - 0s - loss: 0.4662 - accuracy: 0.8138 - val_loss: 0.3804 - val_accuracy: 0.8559
Epoch 2/1000
34/34 - 0s - loss: 0.3630 - accuracy: 0.8412 - val_loss: 0.3625 - val_accuracy: 0.8305
Epoch 3/1000
34/34 - 0s - loss: 0.3182 - accuracy: 0.8847 - val_loss: 0.3736 - val_accuracy: 0.8305
Epoch 4/1000
34/34 - 0s - loss: 0.2909 - accuracy: 0.8904 - val_loss: 0.3712 - val_accuracy: 0.8475
Epoch 5/1000
Restoring model weights from the end of the best epoch.
34/34 - 0s - loss: 0.2726 - accuracy: 0.8922 - val_loss: 0.3891 - val_accuracy: 0.8475
Epoch 00005: early stopping


<tensorflow.python.keras.callbacks.History at 0x18c5d31cdc8>

In [18]:
# Evaluate the model using the testing data
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

10/10 - 0s - loss: 0.3725 - accuracy: 0.8707
Loss: 0.37246158719062805, Accuracy: 0.8707482814788818


In [19]:
X_train[0]

array([   49,     0,  4284,     3,  1475,    28,    97, 22710,    20,
          20,     3,     4,     1,     1,     1,     0,     2,     3,
           1,     3,     2,     2,     2,     1], dtype=int64)

In [20]:
import numpy as np
new_data=np.array([[32,0,1500,0,426,17,58,22808,11,8,7,7,2,3,2,1,2,2,1,0,4,3,1,1]])
# [32,1,2741,0,426,17,58,22808,11,8,7,7,2,3,2,1,2,2,1,0,4,3,1,1]

In [21]:
print(f"Predicted class: {model.predict_classes(new_data)}")

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
Predicted class: [0]
