## Part 1: Preprocessing

In [42]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [43]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Unnamed: 0,0
Age,43
Attrition,2
BusinessTravel,3
Department,3
DistanceFromHome,29
Education,5
EducationField,6
EnvironmentSatisfaction,4
HourlyRate,71
JobInvolvement,4


In [44]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[['Attrition', 'Department']]


In [45]:
# Create a list of at least 10 column names to use as X data
X_columns = ['Age', 'YearsAtCompany', 'JobSatisfaction', 'TotalWorkingYears', 'YearsInCurrentRole', 'MaritalStatus', 'OverTime', 'JobLevel', 'DistanceFromHome', 'Education', 'JobRole']


# Create X_df using your selected columns
X_df = attrition_df[X_columns]

# Show the data types for X_df
print(X_df.dtypes)


Age                    int64
YearsAtCompany         int64
JobSatisfaction        int64
TotalWorkingYears      int64
YearsInCurrentRole     int64
MaritalStatus         object
OverTime              object
JobLevel               int64
DistanceFromHome       int64
Education              int64
JobRole               object
dtype: object


In [46]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=42)


In [47]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary

In [48]:
# Import necessary libraries
from sklearn.preprocessing import OneHotEncoder

# Initialize the OneHotEncoder with handle_unknown='ignore'
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False) # sparse=False for dense output

# Fit and transform the training data
X_train_encoded = ohe.fit_transform(X_train[['YearsAtCompany']])

# Transform the testing data
X_test_encoded = ohe.transform(X_test[['YearsAtCompany']])

In [49]:
# Import necessary libraries
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd

# Identify numerical and categorical features
numerical_features = X_train.select_dtypes(include=['number']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Create transformers for numerical and categorical features
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown='ignore') # sparse=False for dense output

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

# Fit and transform the training data
X_train_scaled = preprocessor.fit_transform(X_train)

# Transform the testing data
X_test_scaled = preprocessor.transform(X_test)

# Convert the scaled data back to DataFrames if needed
X_train_scaled = pd.DataFrame(X_train_scaled, columns=preprocessor.get_feature_names_out())
X_test_scaled = pd.DataFrame(X_test_scaled, columns=preprocessor.get_feature_names_out())

In [50]:
# Create a OneHotEncoder for the Department column
from sklearn.preprocessing import OneHotEncoder

department_ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')


# Fit the encoder to the training data
y_train_department = department_ohe.fit_transform(y_train[['Department']])
y_test_department = department_ohe.transform(y_test[['Department']])

# Create two new variables by applying the encoder
# to the training and testing data




In [51]:
# Create a OneHotEncoder for the Attrition column
from sklearn.preprocessing import LabelEncoder

attrition_le = LabelEncoder()


# Fit the encoder to the training data
y_train_attrition = attrition_le.fit_transform(y_train['Attrition'])
y_test_attrition = attrition_le.transform(y_test['Attrition'])


# Create two new variables by applying the encoder
# to the training and testing data



## Create, Compile, and Train the Model

In [52]:
# Find the number of columns in the X training data
input_dim = X_train_scaled.shape[1]

# Create the input layer
from tensorflow.keras.layers import Input

input_layer = Input(shape=(input_dim,))

from tensorflow.keras.models import Model

# Create at least two shared layers
from tensorflow.keras.layers import Dense

shared_layer1 = Dense(units=64, activation='relu')(input_layer)
shared_layer2 = Dense(units=32, activation='relu')(shared_layer1)

In [53]:
# Create a branch for Department
# with a hidden layer and an output layer
num_departments = y_train_department.shape[1]

dept_branch = Dense(16, activation='relu')(shared_layer2)
dept_output = Dense(num_departments, activation='softmax', name='department_output')(dept_branch)

# Create the hidden layer
# Create the output layer



In [54]:
# Create a branch for Attrition
# with a hidden layer and an output layer
num_attrition = 1

attrition_branch = Dense(16, activation='relu')(shared_layer2)
attrition_output = Dense(1, activation='sigmoid', name='attrition_output')(attrition_branch)

# Create the hidden layer
# Create the output layer



In [55]:
# Create the model
model = Model(inputs=input_layer, outputs=[dept_output, attrition_output])


# Compile the model
model.compile(
    optimizer='adam',
    loss={
        'department_output': 'categorical_crossentropy',
        'attrition_output': 'binary_crossentropy'
    },
    metrics={
        'department_output': 'accuracy',
        'attrition_output': 'accuracy'
    }
)


# Summarize the model
model.summary()

In [56]:
# Train the model
history = model.fit(
    X_train_scaled,
    [y_train_department, y_train_attrition],
    epochs=10,
    batch_size=32,
    validation_data=(X_test_scaled, [y_test_department, y_test_attrition])
)


Epoch 1/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - attrition_output_accuracy: 0.8325 - department_output_accuracy: 0.5711 - loss: 1.4016 - val_attrition_output_accuracy: 0.8673 - val_department_output_accuracy: 0.6769 - val_loss: 1.0548
Epoch 2/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - attrition_output_accuracy: 0.8310 - department_output_accuracy: 0.7496 - loss: 1.0346 - val_attrition_output_accuracy: 0.8673 - val_department_output_accuracy: 0.9116 - val_loss: 0.8395
Epoch 3/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - attrition_output_accuracy: 0.8217 - department_output_accuracy: 0.9034 - loss: 0.8179 - val_attrition_output_accuracy: 0.8673 - val_department_output_accuracy: 0.9082 - val_loss: 0.6236
Epoch 4/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - attrition_output_accuracy: 0.8418 - department_output_accuracy: 0.9283 - loss: 0.5721 - val_attri

In [57]:
# Evaluate the model with the testing data
results = model.evaluate(X_test_scaled, [y_test_department, y_test_attrition])

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - attrition_output_accuracy: 0.8329 - department_output_accuracy: 0.9539 - loss: 0.5214 


In [68]:
# Evaluate the model with the testing data
results = model.evaluate(X_test_scaled, [y_test_department, y_test_attrition])

# Print the loss and metrics for both department and attrition
print("Results:", results)  # Print the entire results list to inspect its contents

# Accessing the values based on their position and meaning in the list.
department_loss = results[0]
attrition_loss = results[1]
department_accuracy = results[2]
attrition_accuracy = results[2]

print(f"Department Loss: {department_loss}")
print(f"Attrition Loss: {attrition_loss}")
print(f"Department Accuracy: {department_accuracy}")
print(f"Attrition Accuracy: {attrition_accuracy}")

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - attrition_output_accuracy: 0.8329 - department_output_accuracy: 0.9539 - loss: 0.5214 
Results: [0.5276601314544678, 0.8469387888908386, 0.9455782175064087]
Department Loss: 0.5276601314544678
Attrition Loss: 0.8469387888908386
Department Accuracy: 0.9455782175064087
Attrition Accuracy: 0.9455782175064087


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. No, there may be an imbalance in the data. If one variable has significantly more data, accuracy may not reflect the model correctly.
2. The chosen activation was softmax for departments and sigmoid for attrition. Softmax is suitable for multi-class classification. This allows for multiple classes and probabilities to sum to 1. Sigmoid is ideal for binary classifications. Outputs between 0 and 1 for each class aligns with its implementation.
3. Improvements to the model can be achieved by:
* Counterbalancing Class: SMOTE would help train the data.
* Features: Having more relevant features to capture underlying patterns.
* Cross-Validation: Use k-fold cross validation. This would help the model's performance is consistent.