## Part 1: Preprocessing

In [1]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers
import tensorflow as tf

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [3]:
# Determine the number of unique values in each column.
print(attrition_df.nunique())

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64


In [4]:
# Get the 10 columns for X data with the highest number of unique values
attrition_df.nunique().sort_values(ascending=False)[0:12]

HourlyRate                 71
Age                        43
TotalWorkingYears          40
YearsAtCompany             37
DistanceFromHome           29
YearsInCurrentRole         19
YearsWithCurrManager       18
YearsSinceLastPromotion    16
PercentSalaryHike          15
NumCompaniesWorked         10
JobRole                     9
TrainingTimesLastYear       7
dtype: int64

In [5]:
attrition_df[['JobRole', 'TrainingTimesLastYear']].sample(15)

# Do I want these added? No.
# TrainingTimesLastYear decreased attrition accuracy by 1% and kept the same accuracy on department.

Unnamed: 0,JobRole,TrainingTimesLastYear
800,Laboratory Technician,2
610,Research Director,3
581,Laboratory Technician,2
132,Sales Executive,2
337,Laboratory Technician,2
1031,Sales Executive,1
957,Research Scientist,0
531,Research Director,6
1073,Manufacturing Director,2
1152,Research Scientist,2


In [6]:
# Create y_df with the Attrition and Department columns
y = attrition_df[['Attrition', 'Department']]

In [24]:
# Create a list of at least 10 column names to use as X data
# 'TrainingTimesLastYear', 
columns = ['Age', 'HourlyRate', 'TotalWorkingYears','YearsAtCompany', 'DistanceFromHome', 'YearsInCurrentRole', 'YearsWithCurrManager', 'YearsSinceLastPromotion', 'NumCompaniesWorked', 'PercentSalaryHike']

attrition_df[columns].sample(15)

Unnamed: 0,Age,HourlyRate,TotalWorkingYears,YearsAtCompany,DistanceFromHome,YearsInCurrentRole,YearsWithCurrManager,YearsSinceLastPromotion,NumCompaniesWorked,PercentSalaryHike
317,52,85,11,8,8,2,7,7,2,15
1162,35,55,15,13,10,12,0,6,9,17
334,45,75,12,10,8,9,8,9,9,14
1345,35,44,4,3,16,2,2,0,0,11
504,45,100,5,1,26,1,0,0,2,14
397,25,99,5,5,4,4,3,1,1,11
177,19,47,1,1,2,0,0,1,1,22
238,32,56,6,4,4,3,2,1,2,11
145,30,84,8,3,5,2,2,2,5,14
1446,34,95,8,8,28,7,7,1,1,21


In [25]:
# Create X_df using your selected columns
x_df = attrition_df[columns]

# Show the data types for X_df
x_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   Age                      1470 non-null   int64
 1   HourlyRate               1470 non-null   int64
 2   TotalWorkingYears        1470 non-null   int64
 3   YearsAtCompany           1470 non-null   int64
 4   DistanceFromHome         1470 non-null   int64
 5   YearsInCurrentRole       1470 non-null   int64
 6   YearsWithCurrManager     1470 non-null   int64
 7   YearsSinceLastPromotion  1470 non-null   int64
 8   NumCompaniesWorked       1470 non-null   int64
 9   PercentSalaryHike        1470 non-null   int64
dtypes: int64(10)
memory usage: 115.0 KB


In [26]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_df, y, random_state=42)

In [27]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary

# All of the data is already in numeric format, so we don't need to convert anything

In [28]:
# Create a StandardScaler
sc = StandardScaler()

# Fit the StandardScaler to the training data
x_train = sc.fit_transform(x_train)

# Scale the training and testing data
x_test = sc.transform(x_test)


In [29]:
# import the OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

# Create a OneHotEncoder for the Department column
ohe = OneHotEncoder(sparse_output=False)

# Fit the encoder to the training data
ohe.fit(y_train[['Department']])

# Create two new variables by applying the encoder
# to the training and testing data
y_train_encoded_dept = ohe.transform(y_train[['Department']])#.toarray()
y_test_encoded_dept = ohe.transform(y_test[['Department']])#.toarray()

# apply to the y train and test data
# y_train['Department'] = y_train_encoded_dept
# y_test['Department'] = y_test_encoded_dept

y_train['Department'].value_counts()

Department
Research & Development    721
Sales                     336
Human Resources            45
Name: count, dtype: int64

In [30]:
# Create a OneHotEncoder for the Attrition column
ohe = OneHotEncoder(sparse_output=False)

# Fit the encoder to the training data
ohe.fit(y_train[['Attrition']])

# Create two new variables by applying the encoder
# to the training and testing data
y_train_encoded_att = ohe.transform(y_train[['Attrition']])#.toarray()
y_test_encoded_att = ohe.transform(y_test[['Attrition']])#.toarray()

# apply to the y train and test data
# y_train['Attrition'] = y_train_encoded_att
# y_test['Attrition'] = y_test_encoded_att

y_train['Attrition'].value_counts()


Attrition
No     913
Yes    189
Name: count, dtype: int64

## Create, Compile, and Train the Model

In [31]:
x_train.shape[1]

10

In [32]:
# Find the number of columns in the X training data
x_col = x_train.shape[1]

# Create the input layer
input_layer = layers.Input(shape=(x_col,), name='Input_layer')

# Create at least two shared layers
dense1_shared = layers.Dense(units=64, activation='relu')(input_layer)
dense2_shared = layers.Dense(units=128, activation='tanh')(dense1_shared)

In [33]:
# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer
dept_dense = layers.Dense(32, activation='relu')(dense2_shared)


# Create the output layer
dept_output = layers.Dense(3, activation='softmax', name='dept_output')(dept_dense)


In [34]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer
attr_dense = layers.Dense(32, activation='relu')(dense2_shared)


# Create the output layer
attr_output = layers.Dense(2, activation='sigmoid', name='attr_output')(dept_dense)


In [35]:
# I want to used a balanced accuracy score for this model
# This was built with ChatGPT from tensorflow.keras import backend as K

# from tensorflow.keras import backend as K

# def balanced_accuracy_score(y_true, y_pred):
#     y_true = K.cast(y_true, 'int32')
#     y_pred = K.cast(K.round(y_pred), 'int32')
#     return tf.py_function(balanced_accuracy_score, (y_true, y_pred), tf.double)

# def balanced_accuracy_score(y_true, y_pred):
#     def sklearn_balanced_accuracy(y_true, y_pred):
#         return balanced_accuracy_score(y_true, y_pred)
#     return tf.py_function(sklearn_balanced_accuracy, (y_true, y_pred), tf.double)

In [36]:
# Create the model
model = Model(inputs=input_layer, outputs=[
    dept_output,
    attr_output
])

# Compile the model
model.compile(optimizer='adam',
    loss={'dept_output': 'categorical_crossentropy', 'attr_output': 'binary_crossentropy'},
    metrics={'dept_output': ['accuracy'], 'attr_output': ['accuracy']}
)

# Compile the model with the balanced accuracy metric
# model.compile(optimizer='adam',
#     loss={'dept_output': 'categorical_crossentropy', 'attr_output': 'categorical_crossentropy'},
#     metrics={'dept_output': [balanced_accuracy], 'attr_output': [balanced_accuracy]}
# )

# Summarize the model
model.summary()

In [37]:
# Train the model
model.fit(x_train, {
    'dept_output': y_train_encoded_dept
    , 'attr_output': y_train_encoded_att
    }
    , epochs=50
    , batch_size=100
    , verbose=1
)


Epoch 1/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - attr_output_accuracy: 0.6216 - dept_output_accuracy: 0.6083 - loss: 1.6219
Epoch 2/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - attr_output_accuracy: 0.8225 - dept_output_accuracy: 0.6600 - loss: 1.3057 
Epoch 3/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - attr_output_accuracy: 0.8267 - dept_output_accuracy: 0.6400 - loss: 1.2498 
Epoch 4/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - attr_output_accuracy: 0.8343 - dept_output_accuracy: 0.6410 - loss: 1.2094 
Epoch 5/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - attr_output_accuracy: 0.8319 - dept_output_accuracy: 0.6631 - loss: 1.1847 
Epoch 6/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - attr_output_accuracy: 0.8249 - dept_output_accuracy: 0.6567 - loss: 1.2010 
Epoch 7/50
[1m12/12[0m [32

<keras.src.callbacks.history.History at 0x297fb3efbb0>

In [38]:
# Evaluate the model with the testing data
results = model.evaluate(np.array(x_test), {
        'dept_output': y_test_encoded_dept,
        'attr_output': y_test_encoded_att
    })

# results with 11 fields
## 12/12 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - attr_output_accuracy: 0.8520 - dept_output_accuracy: 0.6363 - loss: 1.2157 
# results with 10 fields
## 12/12 ━━━━━━━━━━━━━━━━━━━━ 0s 1ms/step - attr_output_accuracy: 0.8527 - dept_output_accuracy: 0.6481 - loss: 1.2314 

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - attr_output_accuracy: 0.8527 - dept_output_accuracy: 0.6481 - loss: 1.2314  


In [39]:
# Print the accuracy for both department and attrition
pred_categories = ['Attrition', 'Department']

idx_plus_vscode = 1
idx_plus_colab = 5

for i, cat in enumerate(pred_categories):
    print(f"{cat} accuracy: {results[i+idx_plus_vscode]}")

# Results with only the top 10 populated fields

# Results with top 11 fields
# Attrition accuracy: 0.8586956262588501
# Department accuracy: 0.635869562625885
# Results with top 10 fields
# Attrition accuracy: 0.8586956262588501
# Department accuracy: 0.64673912525177

Attrition accuracy: 0.8586956262588501
Department accuracy: 0.64673912525177


In [None]:
# show classification_report for the keras model
from sklearn.metrics import classification_report

# Get the model's predictions
y_pred_dept, y_pred_attr = model.predict(x_test)

y_pred_dept = np.round(y_pred_dept).astype(int)
y_pred_attr = np.round(y_pred_attr).astype(int)

# Generate and print classification reports for each output
print("Department Output Classification Report:")
print(classification_report(y_test['Department'], y_pred_dept))

print("Attribute Output Classification Report:")
print(classification_report(y_test['Attrition'], y_pred_attr))


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Department Output Classification Report:
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97       350
         1.0       0.00      0.00      0.00        18

    accuracy                           0.95       368
   macro avg       0.48      0.50      0.49       368
weighted avg       0.90      0.95      0.93       368

Attribute Output Classification Report:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        48
         1.0       0.87      1.00      0.93       320

    accuracy                           0.87       368
   macro avg       0.43      0.50      0.47       368
weighted avg       0.76      0.87      0.81       368



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. I wanted to build a balanced accuracy method, but had trouble building it. Accuracy is a decent metric to review but I also looked at the full Classification report metrics to review.
2. I chose relu and tanh as I've seen those work best in the past
3. Hyperparameter tuning could be done, which I didn't incorporate this time. Additionally, I used the fields with the highest number of unique values, rather than reviewing all the fields or trying to fill in blanks on other fields that might have been helpful to the model.