## Part 1: Preprocessing

In [43]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [44]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [45]:

# Create y_df with the Attrition and Department columns
y_df = attrition_df[["Attrition", "Department"]]

# Display the first few rows of y_df to confirm
y_df.head()


Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [46]:

# Create a list of at least 10 column names to use as X data
# Select 10 columns to use as features for X_df
x_columns = [
    "Age", "DistanceFromHome", "Education", "EnvironmentSatisfaction", 
    "JobInvolvement", "JobSatisfaction", "NumCompaniesWorked", 
    "TotalWorkingYears", "YearsAtCompany", "YearsInCurrentRole"
]

# Create X_df using your selected columns
X_df = attrition_df[x_columns]

# Show the data types for X_df
X_df.dtypes

# Create X_df using your selected columns


# Show the data types for X_df



Age                        int64
DistanceFromHome           int64
Education                  int64
EnvironmentSatisfaction    int64
JobInvolvement             int64
JobSatisfaction            int64
NumCompaniesWorked         int64
TotalWorkingYears          int64
YearsAtCompany             int64
YearsInCurrentRole         int64
dtype: object

In [47]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_df, y_df, random_state=78)


In [None]:
# Define class weights for Department
department_class_weights = {
    0: 1.0,  # Human Resources
    1: 0.6,  # Research & Development
    2: 1.2   # Sales
}

# Define class weights for Attrition
attrition_class_weights = {
    0: 0.2,  # No
    1: 1.0   # Yes
}

# Combine into a dictionary for model training
class_weights = {
    "Department_Output": department_class_weights,
    "Attrition_Output": attrition_class_weights
}

In [None]:
# Create a StandardScaler
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler to the training data
scaler.fit(x_train)

# Scale the training and testing data
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Display the first few rows of the scaled training data to confirm
print("Scaled Training Data:")
print(x_train_scaled[:5])




Scaled Training Data:
[[-0.40942386  0.87475435  0.10528682 -1.54673288  0.38047011 -1.54172867
   0.92004345 -0.65831229 -0.63814846 -0.590081  ]
 [-1.29889668 -0.50720476 -0.88638904 -0.63425984  1.79218077 -1.54172867
  -0.67349349 -0.65831229 -0.1363124  -0.31430947]
 [-0.07587155  0.49785641  2.08863855 -0.63425984 -1.03124054 -1.54172867
   0.92004345  0.66697113 -0.80542715 -0.590081  ]
 [-0.96534437  0.12095847  0.10528682  0.27821319  0.38047011  0.26186474
   0.12327498 -0.12819892  0.19824498  0.78877664]
 [-1.18771258  1.000387    1.09696269  1.19068623 -2.4429512   0.26186474
  -0.67349349 -1.05589732 -0.63814846 -0.590081  ]]


In [None]:

from sklearn.preprocessing import OneHotEncoder

# Create a OneHotEncoder instance
department_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Fit the encoder to the Department column in the training data (from y_train)
department_encoder.fit(y_train[["Department"]])

# Transform the Department column in the training and testing data
x_train_department_encoded = department_encoder.transform(y_train[["Department"]])
x_test_department_encoded = department_encoder.transform(y_test[["Department"]])

# Convert the encoded arrays into DataFrames with meaningful column names
department_columns = department_encoder.get_feature_names_out(["Department"])
x_train_department_df = pd.DataFrame(x_train_department_encoded, columns=department_columns, index=x_train.index)
x_test_department_df = pd.DataFrame(x_test_department_encoded, columns=department_columns, index=x_test.index)

# Add the encoded Department columns to x_train and x_test
x_train = pd.concat([x_train, x_train_department_df], axis=1)
x_test = pd.concat([x_test, x_test_department_df], axis=1)

# Display the first few rows of the updated x_train to confirm
x_train.head()


Unnamed: 0,Age,DistanceFromHome,Education,EnvironmentSatisfaction,JobInvolvement,JobSatisfaction,NumCompaniesWorked,TotalWorkingYears,YearsAtCompany,YearsInCurrentRole,Department_Human Resources,Department_Research & Development,Department_Sales
591,33,16,3,1,3,1,5,6,3,2,0.0,0.0,1.0
267,25,5,2,2,4,1,1,6,6,3,0.0,1.0,0.0
1236,36,13,5,2,2,1,5,16,2,2,0.0,0.0,1.0
788,28,10,3,3,3,3,3,10,8,7,0.0,1.0,0.0
1224,26,17,4,4,1,3,1,3,3,2,0.0,1.0,0.0


In [None]:
# Create a OneHotEncoder instance for the Attrition column
attrition_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Fit the encoder to the Attrition column in the training data
attrition_encoder.fit(y_train[["Attrition"]])

# Transform the Attrition column in the training and testing data
y_train_attrition_encoded = attrition_encoder.transform(y_train[["Attrition"]])
y_test_attrition_encoded = attrition_encoder.transform(y_test[["Attrition"]])

# Convert the encoded arrays into DataFrames with meaningful column names
attrition_columns = attrition_encoder.get_feature_names_out(["Attrition"])
y_train_attrition_df = pd.DataFrame(y_train_attrition_encoded, columns=attrition_columns, index=y_train.index)
y_test_attrition_df = pd.DataFrame(y_test_attrition_encoded, columns=attrition_columns, index=y_test.index)

# Display the first few rows of the encoded training target to confirm
y_train_attrition_df.head()


Unnamed: 0,Attrition_No,Attrition_Yes
591,0.0,1.0
267,1.0,0.0
1236,0.0,1.0
788,1.0,0.0
1224,1.0,0.0


## Create, Compile, and Train the Model

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input

# Find the number of columns in the X training data
input_dim = x_train.shape[1]

# Create the input layer
input_layer = Input(shape=(input_dim,))

# Create at least two shared layers
shared_layer1 = Dense(64, activation="relu")(input_layer)
shared_layer2 = Dense(32, activation="relu")(shared_layer1)

# Create the shared model so far
shared_model = shared_layer2

In [None]:

# Hidden layer for Department branch
department_hidden_layer = Dense(16, activation="relu")(shared_model)

# Output layer for Department branch (assuming it has 3 categories)
department_output_layer = Dense(3, activation="softmax", name="Department_Output")(department_hidden_layer)


In [None]:

# Hidden layer for Attrition branch
attrition_hidden_layer = Dense(16, activation="relu")(shared_model)

# Output layer for Attrition branch (binary classification: Yes/No)
attrition_output_layer = Dense(2, activation="softmax", name="Attrition_Output")(attrition_hidden_layer)


In [None]:

# Summarize the model
from tensorflow.keras.models import Model

# Create the model with input and both output layers
model = Model(
    inputs=input_layer,
    outputs=[department_output_layer, attrition_output_layer],
    name="Attrition_Model"
)

# Compile the model
model.compile(
    optimizer="adam",
    loss={"Department_Output": "categorical_crossentropy", "Attrition_Output": "categorical_crossentropy"},
    metrics={"Department_Output": "accuracy", "Attrition_Output": "accuracy"}
)

# Summarize the model
model.summary()

In [None]:

# Create a OneHotEncoder instance for the Department column
department_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Fit the encoder to the Department column in y_train
department_encoder.fit(y_train[["Department"]])

# Transform the Department column for both training and testing data
y_train_department_encoded = department_encoder.transform(y_train[["Department"]])
y_test_department_encoded = department_encoder.transform(y_test[["Department"]])

# Convert the encoded arrays into DataFrames
department_columns = department_encoder.get_feature_names_out(["Department"])
y_train_department_df = pd.DataFrame(y_train_department_encoded, columns=department_columns, index=y_train.index)
y_test_department_df = pd.DataFrame(y_test_department_encoded, columns=department_columns, index=y_test.index)



In [None]:

# Evaluate the model with the testing data
evaluation_results = model.evaluate(
    x=x_test, 
    y={"Department_Output": y_test_department_df, "Attrition_Output": y_test_attrition_df}, 
    verbose=1
)

# Print the results
print("Evaluation Results:")
print(f"Department Loss: {evaluation_results[1]:.4f}")
print(f"Department Accuracy: {evaluation_results[3]:.4f}")
print(f"Attrition Loss: {evaluation_results[2]:.4f}")
print(f"Attrition Accuracy: {evaluation_results[4]:.4f}")

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - Attrition_Output_accuracy: 0.7834 - Attrition_Output_loss: 0.6426 - Department_Output_accuracy: 0.6102 - Department_Output_loss: 0.9312 - loss: 1.5780  
Evaluation Results:
Department Loss: 0.9185
Department Accuracy: 0.8098
Attrition Loss: 0.5756
Attrition Accuracy: 0.6168


In [None]:
# Print the accuracy for both department and attrition
# Evaluate the model with the testing data
evaluation_results = model.evaluate(
    x=x_test, 
    y={"Department_Output": y_test_department_df, "Attrition_Output": y_test_attrition_df}, 
    verbose=1
)

# Print the accuracy for both outputs
print("Model Evaluation Results:")
print(f"Overall Loss: {evaluation_results[0]:.4f}")
print(f"Department Accuracy: {evaluation_results[3]:.4f}")  # Accuracy of the Department output
print(f"Attrition Accuracy: {evaluation_results[4]:.4f}")   # Accuracy of the Attrition output

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - Attrition_Output_accuracy: 0.7834 - Attrition_Output_loss: 0.6426 - Department_Output_accuracy: 0.6102 - Department_Output_loss: 0.9312 - loss: 1.5780
Model Evaluation Results:
Overall Loss: 1.5220
Department Accuracy: 0.8098
Attrition Accuracy: 0.6168


In [None]:
print(y_train_department_df.sum(axis=0))
print(y_train_attrition_df.sum(axis=0))

Department_Human Resources            44.0
Department_Research & Development    734.0
Department_Sales                     324.0
dtype: float64
Attrition_No     912.0
Attrition_Yes    190.0
dtype: float64


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. 
2. 
3. 