In [63]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
import pandas as pd 
HR_data_df = pd.read_csv('Resources/HR_Employee_Attrition.csv')
HR_data_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [64]:
HR_data_df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [65]:
HR_data_df.drop(['BusinessTravel', 'Department', 'Education', 'EducationField', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'Over18', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'YearsInCurrentRole'], axis="columns", inplace=True)
HR_data_df

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,TotalWorkingYears,YearsAtCompany
0,41,Yes,1102,1,5993,19479,8,Yes,11,8,6
1,49,No,279,8,5130,24907,1,No,23,10,10
2,37,Yes,1373,2,2090,2396,6,Yes,15,7,0
3,33,No,1392,3,2909,23159,1,Yes,11,8,8
4,27,No,591,2,3468,16632,9,No,12,6,2
...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,884,23,2571,12290,4,No,17,17,5
1466,39,No,613,6,9991,21457,4,No,15,9,7
1467,27,No,155,4,6142,5174,1,Yes,20,6,6
1468,49,No,1023,2,5390,13243,2,No,14,17,9


In [66]:
#Convert Overtime column from categorical to numeric
categorical_col_1 = []
for column in HR_data_df.columns:
    if HR_data_df[column].dtype == object and len(HR_data_df[column].unique()) <= 50:
        categorical_col_1.append(column)
        
HR_data_df['OverTime'] = HR_data_df.Attrition.astype("category").cat.codes

In [67]:
#Convert attrition column from categorical to numeric
categorical_col = []
for column in HR_data_df.columns:
    if HR_data_df[column].dtype == object and len(HR_data_df[column].unique()) <= 50:
        categorical_col.append(column)
        
HR_data_df['Attrition'] = HR_data_df.Attrition.astype("category").cat.codes

In [68]:
# Attrition and OvterTime columns are converted from yes/no to 1/0
HR_data_df.head()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,TotalWorkingYears,YearsAtCompany
0,41,1,1102,1,5993,19479,8,1,11,8,6
1,49,0,279,8,5130,24907,1,0,23,10,10
2,37,1,1373,2,2090,2396,6,1,15,7,0
3,33,0,1392,3,2909,23159,1,0,11,8,8
4,27,0,591,2,3468,16632,9,0,12,6,2


In [69]:
# Determine the number of unique values in each column.
HR_data_df.nunique()

Age                     43
Attrition                2
DailyRate              886
DistanceFromHome        29
MonthlyIncome         1349
MonthlyRate           1427
NumCompaniesWorked      10
OverTime                 2
PercentSalaryHike       15
TotalWorkingYears       40
YearsAtCompany          37
dtype: int64

In [70]:
# Look at Age value counts for binning
NCW_counts = HR_data_df['NumCompaniesWorked'].value_counts()
NCW_counts

1    521
0    197
3    159
2    146
4    139
7     74
6     70
5     63
9     52
8     49
Name: NumCompaniesWorked, dtype: int64

In [71]:
# Choose a cutoff value and create a list of application types to be replaced
# use the variable name `NumCompaniesWorked_groups_to_replace`
NCW_groups_to_replace = list(NCW_counts[NCW_counts < 100].index)

# Replace in dataframe
for app in NCW_groups_to_replace:
    HR_data_df['NumCompaniesWorked'] = HR_data_df['NumCompaniesWorked'].replace(app,"Other")

# Check to make sure binning was successful
HR_data_df['NumCompaniesWorked'].value_counts()

1        521
Other    308
0        197
3        159
2        146
4        139
Name: NumCompaniesWorked, dtype: int64

In [72]:
# Look at AGE value counts for binning
Age_counts = HR_data_df['Age'].value_counts()
Age_counts

35    78
34    77
36    69
31    69
29    68
32    61
30    60
33    58
38    58
40    57
37    50
27    48
28    48
42    46
39    42
45    41
41    40
26    39
44    33
46    33
43    32
50    30
25    26
24    26
49    24
47    24
55    22
51    19
53    19
48    19
54    18
52    18
22    16
56    14
23    14
58    14
21    13
20    11
59    10
19     9
18     8
60     5
57     4
Name: Age, dtype: int64

In [73]:
Age_counts[Age_counts > 20]

35    78
34    77
36    69
31    69
29    68
32    61
30    60
33    58
38    58
40    57
37    50
27    48
28    48
42    46
39    42
45    41
41    40
26    39
44    33
46    33
43    32
50    30
25    26
24    26
49    24
47    24
55    22
Name: Age, dtype: int64

In [74]:
# Choose a cutoff value and create a list of age to be replaced
# use the variable name `age_to_replace`
age_to_replace = list(Age_counts[Age_counts < 30].index)

# Replace in dataframe
for cls in age_to_replace:
    HR_data_df['Age'] = HR_data_df['Age'].replace(cls,"Other")

# Check to make sure binning was successful
HR_data_df['Age'].value_counts()

Other    333
35        78
34        77
31        69
36        69
29        68
32        61
30        60
33        58
38        58
40        57
37        50
27        48
28        48
42        46
39        42
45        41
41        40
26        39
44        33
46        33
43        32
50        30
Name: Age, dtype: int64

In [75]:
# Convert categorical data to numeric with `pd.get_dummies`
HR_data_df_new = pd.get_dummies(HR_data_df)

In [76]:
# Split our preprocessed data into our features and target arrays
X = HR_data_df_new.drop(["Attrition"],axis='columns').values
y = HR_data_df_new["Attrition"].values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [77]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [78]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=80, activation='relu', input_dim=len(X_train[0])))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=30, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation = 'sigmoid'))

# Check the structure of the model
nn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 80)                3040      
                                                                 
 dense_7 (Dense)             (None, 30)                2430      
                                                                 
 dense_8 (Dense)             (None, 1)                 31        
                                                                 
Total params: 5501 (21.49 KB)
Trainable params: 5501 (21.49 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [79]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [80]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [62]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

12/12 - 0s - loss: 6.9401e-05 - accuracy: 1.0000 - 207ms/epoch - 17ms/step
Loss: 6.940117600606754e-05, Accuracy: 1.0
