In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf

# Import our input dataset
df = pd.read_csv('https://static.bc-edx.com/data/dl-1-2/m21/lessons/2/HR-Employee-Attrition.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,Education,JobRole,DistanceFromHome,EnvironmentSatisfaction,...,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,StandardHours,WorkLifeBalance
0,0,35,Yes,Travel_Frequently,502,S,a,l,5,4,...,Divorced,9200,10448,2,Y,Yes,5,1,80,1
1,1,42,No,Non-Travel,497,S,a,l,3,3,...,Single,10719,9049,3,Y,No,7,2,80,4
2,2,34,No,Non-Travel,505,T,e,c,4,3,...,Single,9843,9941,3,Y,No,10,3,80,1
3,3,43,No,Non-Travel,493,T,e,c,5,2,...,Divorced,12077,10150,5,Y,Yes,1,3,80,4
4,4,43,Yes,Travel_Rarely,536,R,e,s,4,4,...,Single,10486,10008,2,Y,No,9,1,80,3


In [2]:
# Generate our categorical variable lists
cat_cols = ['Attrition', 'BusinessTravel','Department', 'Education', 'JobRole', 'Gender', 'MaritalStatus', 'Over18', 'OverTime'] 
df_cat = df[cat_cols]

In [3]:
# Check the number of unique values in each column
df_cat.nunique()

Attrition         2
BusinessTravel    3
Department        3
Education         2
JobRole           3
Gender            2
MaritalStatus     3
Over18            2
OverTime          2
dtype: int64

In [4]:
from sklearn.preprocessing import OneHotEncoder

# Initialize the OneHotEncoder
encoder = OneHotEncoder(sparse=False)  # 'sparse=False' means the output will be a dense array

# Fit and transform the categorical data
encoded_data = encoder.fit_transform(df_cat)



In [5]:
encoder.get_feature_names_out(df_cat.columns)

array(['Attrition_No', 'Attrition_Yes', 'BusinessTravel_Non-Travel',
       'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely',
       'Department_R', 'Department_S', 'Department_T', 'Education_a',
       'Education_e', 'JobRole_c', 'JobRole_l', 'JobRole_s', 'Gender_F',
       'Gender_M', 'MaritalStatus_Divorced', 'MaritalStatus_Married',
       'MaritalStatus_Single', 'Over18_N', 'Over18_Y', 'OverTime_No',
       'OverTime_Yes'], dtype=object)

In [6]:
df_cat_encoded = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(df_cat.columns))
df_cat_encoded.head()

Unnamed: 0,Attrition_No,Attrition_Yes,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_R,Department_S,Department_T,Education_a,Education_e,...,JobRole_s,Gender_F,Gender_M,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_N,Over18_Y,OverTime_No,OverTime_Yes
0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0


In [7]:
df_num = df[[col for col in df.columns if col not in df_cat.columns]]

In [8]:
df2 = pd.concat([df_num, df_cat_encoded], axis=1)
df2.head()

Unnamed: 0.1,Unnamed: 0,Age,DailyRate,DistanceFromHome,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,...,JobRole_s,Gender_F,Gender_M,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_N,Over18_Y,OverTime_No,OverTime_Yes
0,0,35,502,5,4,48,4,2,5,9200,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,1,42,497,3,3,50,4,3,5,10719,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,2,34,505,4,3,46,2,3,5,9843,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
3,3,43,493,5,2,48,3,1,5,12077,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,4,43,536,4,4,53,2,3,2,10486,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0


In [9]:
# Split our preprocessed data into our features and target arrays
y = df2['Attrition_Yes']
X = df2[[col for col in df2.columns if col not in ['Attrition_Yes', 'Attrition_No']]]
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y)
# Split the preprocessed data into a training and testing dataset



In [10]:
# Create a StandardScaler instances
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler
scaler.fit(x_train)

# Scale the data
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [11]:
# Define the model - deep neural net
import tensorflow as tf
model = tf.keras.Sequential()

# First hidden layer
model.add(tf.keras.layers.Dense(8, input_shape=(x_train_scaled.shape[1],), activation='relu'))
model.add(tf.keras.layers.Dense(5, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()




2023-11-05 16:58:12.044357: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2023-11-05 16:58:12.044382: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2023-11-05 16:58:12.044386: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2023-11-05 16:58:12.044417: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-11-05 16:58:12.044434: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 296       
                                                                 
 dense_1 (Dense)             (None, 5)                 45        
                                                                 
 dense_2 (Dense)             (None, 1)                 6         
                                                                 
Total params: 347 (1.36 KB)
Trainable params: 347 (1.36 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [12]:
# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [13]:
# Train the model
model.fit(x_train_scaled, y_train, epochs=100)


Epoch 1/100


2023-11-05 16:58:13.082019: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<keras.src.callbacks.History at 0x282332a40>

In [14]:
# Evaluate the model using the test data
loss, accuracy = model.evaluate(x_test_scaled, y_test)
print(f'loss = {loss}, accuracy = {accuracy}')

loss = 0.49324527382850647, accuracy = 0.8209366202354431
