## Part 1: Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from tensorflow import keras

# Load the employee attrition dataset from the hosted course CSV
ATTRITION_CSV_URL = 'https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv'
attrition_df = pd.read_csv(ATTRITION_CSV_URL)

# Preview the first rows
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [2]:
# Summarize every column's dtype and non-null count
attrition_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   Department                1470 non-null   object
 4   DistanceFromHome          1470 non-null   int64 
 5   Education                 1470 non-null   int64 
 6   EducationField            1470 non-null   object
 7   EnvironmentSatisfaction   1470 non-null   int64 
 8   HourlyRate                1470 non-null   int64 
 9   JobInvolvement            1470 non-null   int64 
 10  JobLevel                  1470 non-null   int64 
 11  JobRole                   1470 non-null   object
 12  JobSatisfaction           1470 non-null   int64 
 13  MaritalStatus             1470 non-null   object
 14  NumCompaniesWorked      

In [3]:
# Determine the number of unique values in each column
# (helps spot low-cardinality, categorical-style columns).
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [4]:
# Build the target frame `y` from the two columns the model will predict
target_columns = ['Attrition', 'Department']
y = attrition_df[target_columns]

# Count how often each (Attrition, Department) combination occurs
y.value_counts()

Attrition  Department            
No         Research & Development    828
           Sales                     354
Yes        Research & Development    133
           Sales                      92
No         Human Resources            51
Yes        Human Resources            12
Name: count, dtype: int64

In [5]:
# Class distribution of the Department target
y['Department'].value_counts()

Department
Research & Development    961
Sales                     446
Human Resources            63
Name: count, dtype: int64

In [6]:
# Class distribution of the Attrition target
y['Attrition'].value_counts()

Attrition
No     1233
Yes     237
Name: count, dtype: int64

In [7]:
# Feature columns used as model inputs (the exercise asks for at least 10)
X_columns = [
    'Age',
    'BusinessTravel',
    'DistanceFromHome',
    'HourlyRate',
    'JobInvolvement',
    'JobSatisfaction',
    'MaritalStatus',
    'NumCompaniesWorked',
    'PercentSalaryHike',
    'PerformanceRating',
    'WorkLifeBalance',
    'YearsSinceLastPromotion',
]

# Take an independent copy of those columns so later changes to X
# never write back into attrition_df
X = attrition_df[X_columns].copy()

# Display the selected feature frame
X


Unnamed: 0,Age,BusinessTravel,DistanceFromHome,HourlyRate,JobInvolvement,JobSatisfaction,MaritalStatus,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,WorkLifeBalance,YearsSinceLastPromotion
0,41,Travel_Rarely,1,94,3,4,Single,8,11,3,1,0
1,49,Travel_Frequently,8,61,2,2,Married,1,23,4,3,1
2,37,Travel_Rarely,2,92,2,3,Single,6,15,3,3,0
3,33,Travel_Frequently,3,56,3,3,Married,1,11,3,3,3
4,27,Travel_Rarely,2,40,3,2,Married,9,12,3,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,Travel_Frequently,23,41,4,4,Married,4,17,3,3,0
1466,39,Travel_Rarely,6,42,2,1,Married,4,15,3,3,1
1467,27,Travel_Rarely,4,87,4,2,Married,1,20,4,3,0
1468,49,Travel_Frequently,2,63,2,2,Married,2,14,3,2,0


In [8]:
# Show the data types and non-null counts of the selected feature columns
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Age                      1470 non-null   int64 
 1   BusinessTravel           1470 non-null   object
 2   DistanceFromHome         1470 non-null   int64 
 3   HourlyRate               1470 non-null   int64 
 4   JobInvolvement           1470 non-null   int64 
 5   JobSatisfaction          1470 non-null   int64 
 6   MaritalStatus            1470 non-null   object
 7   NumCompaniesWorked       1470 non-null   int64 
 8   PercentSalaryHike        1470 non-null   int64 
 9   PerformanceRating        1470 non-null   int64 
 10  WorkLifeBalance          1470 non-null   int64 
 11  YearsSinceLastPromotion  1470 non-null   int64 
dtypes: int64(10), object(2)
memory usage: 137.9+ KB


In [9]:
# Count the distinct categories in the two object-typed feature columns
X[['BusinessTravel', 'MaritalStatus']].nunique()

BusinessTravel    3
MaritalStatus     3
dtype: int64

In [10]:
from sklearn.preprocessing import OneHotEncoder

# The two text-valued feature columns that need one-hot encoding
categorical_columns = ['BusinessTravel', 'MaritalStatus']

# Dense output so the encoded array can be wrapped directly in a DataFrame
oh_encoder = OneHotEncoder(sparse_output=False)
encoded_values = oh_encoder.fit_transform(X[categorical_columns])

# Name each indicator column after its source feature and category
oh_df = pd.DataFrame(
    encoded_values,
    columns=oh_encoder.get_feature_names_out(categorical_columns),
)
oh_df

Unnamed: 0,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...
1465,0.0,1.0,0.0,0.0,1.0,0.0
1466,0.0,0.0,1.0,0.0,1.0,0.0
1467,0.0,0.0,1.0,0.0,1.0,0.0
1468,0.0,1.0,0.0,0.0,1.0,0.0


In [11]:
# Swap the raw text columns for their one-hot indicators: drop them from X,
# then attach the encoded columns by row index.
final_X = (
    X.drop(columns=['BusinessTravel', 'MaritalStatus'])
     .merge(oh_df, left_index=True, right_index=True)
)
final_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 16 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Age                               1470 non-null   int64  
 1   DistanceFromHome                  1470 non-null   int64  
 2   HourlyRate                        1470 non-null   int64  
 3   JobInvolvement                    1470 non-null   int64  
 4   JobSatisfaction                   1470 non-null   int64  
 5   NumCompaniesWorked                1470 non-null   int64  
 6   PercentSalaryHike                 1470 non-null   int64  
 7   PerformanceRating                 1470 non-null   int64  
 8   WorkLifeBalance                   1470 non-null   int64  
 9   YearsSinceLastPromotion           1470 non-null   int64  
 10  BusinessTravel_Non-Travel         1470 non-null   float64
 11  BusinessTravel_Travel_Frequently  1470 non-null   float64
 12  Busine

In [12]:
# Split the data into training and testing sets (80% train / 20% test).
# A fixed random_state makes the split -- and every metric derived from it --
# reproducible across "Restart Kernel -> Run All".
# train_test_split is already imported in the notebook's first cell, so it is
# not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    final_X, y, test_size=0.2, random_state=42
)

In [13]:
# Fit the StandardScaler on the training split only
std_scaler = StandardScaler()
std_scaler.fit(X_train)

# Apply the same fitted scaler to both splits
X_train_scaled = std_scaler.transform(X_train)
X_test_scaled = std_scaler.transform(X_test)

In [14]:
# Create a OneHotEncoder for the Department column.
# `sparse_output=False` replaces the `sparse` keyword, which scikit-learn
# deprecated in 1.2 and removed in 1.4 (the feature encoder earlier in this
# notebook already uses `sparse_output`). Dense output also removes the need
# for a `.toarray()` call afterwards.
d_oneHot_Encoder = OneHotEncoder(sparse_output=False)

# Fit the encoder to the training data only; the double brackets keep the
# column as a 2-D, single-column frame, which is the shape the encoder expects
d_oneHot_Encoder.fit(y_train[['Department']])

# Create two new variables by applying the encoder
# to the training and testing data (one indicator column per department)
y_dept_train_encoded = d_oneHot_Encoder.transform(y_train[['Department']])
y_dept_test_encoded = d_oneHot_Encoder.transform(y_test[['Department']])



In [15]:
# Create a OneHotEncoder for the Attrition column.
# `drop='first'` drops the first category's indicator, leaving a single 0/1
# column for this two-class target.
# `sparse_output=False` replaces the `sparse` keyword, which scikit-learn
# deprecated in 1.2 and removed in 1.4; dense output also removes the need
# for a `.toarray()` call afterwards.
a_oneHot_Encoder = OneHotEncoder(sparse_output=False, drop='first')

# Fit the encoder to the training data only; the double brackets keep the
# column as a 2-D, single-column frame, which is the shape the encoder expects
a_oneHot_Encoder.fit(y_train[['Attrition']])

# Create two new variables by applying the encoder
# to the training and testing data
y_attr_train_encoded = a_oneHot_Encoder.transform(y_train[['Attrition']])
y_attr_test_encoded = a_oneHot_Encoder.transform(y_test[['Attrition']])




## Create, Compile, and Train the Model

In [16]:
# Number of input features = number of columns in the encoded feature frame
input_data = final_X.shape[1]

# Input layer: one value per feature column
input_layer = layers.Input(shape=(input_data,), name='input_data')

# Two fully connected layers shared by both output branches
s_layer_1 = layers.Dense(units=64, activation='relu', name='shared_layer1')(input_layer)
s_layer_2 = layers.Dense(units=64, activation='relu', name='shared_layer2')(s_layer_1)

In [17]:
# Department branch: a hidden layer followed by an output layer,
# both hanging off the second shared layer

# Hidden layer
dept_hidden = layers.Dense(units=32, activation='relu', name='h_dept_layer')
h_dept_layer = dept_hidden(s_layer_2)

# Output layer: three units with softmax
dept_output = layers.Dense(units=3, activation='softmax', name='department')
department = dept_output(h_dept_layer)


In [18]:
# Attrition branch: a hidden layer followed by an output layer,
# both hanging off the second shared layer

# Hidden layer
attr_hidden = layers.Dense(units=32, activation='relu', name='h_attr_layer')
h_attr_layer = attr_hidden(s_layer_2)

# Output layer: a single sigmoid unit
attr_output = layers.Dense(units=1, activation='sigmoid', name='attrition')
attrition = attr_output(h_attr_layer)

In [19]:
# Assemble the two-output model: one shared input, department and attrition heads
attrition_model = Model(inputs=input_layer, outputs=[department, attrition])

# Per-output loss and metric, keyed by output-layer name
output_losses = {
    'department': 'categorical_crossentropy',
    'attrition': 'binary_crossentropy',
}
output_metrics = {
    'department': 'accuracy',
    'attrition': 'accuracy',
}

# Compile the model
attrition_model.compile(
    optimizer='adam',
    loss=output_losses,
    metrics=output_metrics,
)

# Summarize the model
attrition_model.summary()

In [20]:
# Training targets, keyed by the output-layer names
train_targets = {
    'department': y_dept_train_encoded,
    'attrition': y_attr_train_encoded,
}

# Train for 100 epochs, holding out 20% of the training data for validation
attrition_model.fit(
    X_train_scaled,
    train_targets,
    epochs=100,
    validation_split=0.2,
)


Epoch 1/100


[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - attrition_accuracy: 0.8064 - department_accuracy: 0.6465 - loss: 1.4833 - val_attrition_accuracy: 0.8814 - val_department_accuracy: 0.6483 - val_loss: 1.1554
Epoch 2/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - attrition_accuracy: 0.8251 - department_accuracy: 0.6635 - loss: 1.2270 - val_attrition_accuracy: 0.8814 - val_department_accuracy: 0.6483 - val_loss: 1.1433
Epoch 3/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - attrition_accuracy: 0.8177 - department_accuracy: 0.6381 - loss: 1.2252 - val_attrition_accuracy: 0.8814 - val_department_accuracy: 0.6483 - val_loss: 1.1277
Epoch 4/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - attrition_accuracy: 0.8434 - department_accuracy: 0.6561 - loss: 1.1289 - val_attrition_accuracy: 0.8814 - val_department_accuracy: 0.6483 - val_loss: 1.1320
Epoch 5/100
[1m30/30[0m [32m━

<keras.src.callbacks.history.History at 0x289ea3fd0>

In [21]:
# Test targets, keyed by the output-layer names
test_targets = {
    'department': y_dept_test_encoded,
    'attrition': y_attr_test_encoded,
}

# Evaluate the model with the testing data. Despite its name, `predictions`
# holds the values returned by evaluate(), not per-row predictions.
predictions = attrition_model.evaluate(X_test_scaled, test_targets)

predictions


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 677us/step - attrition_accuracy: 0.7956 - department_accuracy: 0.5076 - loss: 4.7808


[4.5828046798706055, 0.8027210831642151, 0.5136054158210754]

In [22]:
# Print the accuracy for both department and attrition.
# NOTE(review): the positional indexes (1 = attrition, 2 = department) match
# the order evaluate() reported in this run -- confirm if the Keras version or
# the model's outputs change.
print(
    f'Department predictions accuracy: {predictions[2]}',
    f'Attrition predictions accuracy: {predictions[1]}',
    sep='\n',
)

Department predictions accuracy: 0.5136054158210754
Attrition predictions accuracy: 0.8027210831642151


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

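1. Accuracy is a useful starting metric, but it is not the best single metric for this data. The Attrition target is imbalanced (1,233 "No" vs. 237 "Yes"), so a model that always predicts "No" would score about 84% accuracy, which is higher than the roughly 80% attrition accuracy measured on the test set. We should also compute precision (and recall) to check how precise the predictions are for each class.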
2. Softmax for the department output, since it is a multi-class target (three departments), and sigmoid for the attrition output, since it is a binary (Yes/No) target.
3. Add more metrics to get a better idea of how good the predictions are, and improve the model based on them. Try different features in X and see whether the model performs better. The Keras Hyperband tuner could also be used to improve the attrition accuracy.