## Part 1: Preprocessing

In [None]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

# Read the attrition CSV from its remote URL into a DataFrame
ATTRITION_CSV_URL = 'https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv'
attrition_df = pd.read_csv(ATTRITION_CSV_URL)

# Preview the first five rows
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [None]:
# Show the dtype pandas inferred for every column; columns reported as `object`
# are non-numeric and have to be encoded before they can be fed to a model.
attrition_df.dtypes

Age                          int64
Attrition                   object
BusinessTravel              object
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EnvironmentSatisfaction      int64
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
NumCompaniesWorked           int64
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StockOptionLevel             int64
TotalWorkingYears            int64
TrainingTimesLastYear        int64
WorkLifeBalance              int64
YearsAtCompany               int64
YearsInCurrentRole           int64
YearsSinceLastPromotion      int64
YearsWithCurrManager         int64
dtype: object

In [None]:
# Determine the number of unique values in each column.
# The counts for the two target columns (Attrition, Department) give the number
# of classes each output of the model has to distinguish.
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [None]:
# Build the target frame holding the two label columns: Attrition and Department
y_df = attrition_df.loc[:, ["Attrition", "Department"]]


In [None]:
# Feature columns used as model inputs (ten in total).
# JobRole and OverTime are text columns and are encoded further down.
X_columns = [
    "Age",
    "Education",
    "HourlyRate",
    "JobRole",
    "TotalWorkingYears",
    "OverTime",
    "DistanceFromHome",
    "EnvironmentSatisfaction",
    "YearsSinceLastPromotion",
    "YearsWithCurrManager",
]

# Select the feature columns from the full dataset
X_df = attrition_df.loc[:, X_columns]

# Display the dtype of every selected feature
X_df.dtypes


Age                         int64
Education                   int64
HourlyRate                  int64
JobRole                    object
TotalWorkingYears           int64
OverTime                   object
DistanceFromHome            int64
EnvironmentSatisfaction     int64
YearsSinceLastPromotion     int64
YearsWithCurrManager        int64
dtype: object

In [None]:
# List the names of the selected feature columns
X_df.columns

Index(['Age', 'Education', 'HourlyRate', 'JobRole', 'TotalWorkingYears',
       'OverTime', 'DistanceFromHome', 'EnvironmentSatisfaction',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [None]:
# Drop every row that has a missing feature value, then copy so that later
# modifications never write back into X_df.
X_df_clean = X_df.dropna(axis="index", how="any").copy()

In [None]:
# Split the data into training and testing sets.
# X_df_clean may have lost rows to dropna(), while y_df still has every row.
# Select the target rows by the surviving index labels first so features and
# targets stay row-aligned; otherwise train_test_split raises on inconsistent
# sample counts as soon as a single row is dropped.
y_df_clean = y_df.loc[X_df_clean.index]

# 80/20 split with a fixed seed so the partition is reproducible.
# (train_test_split is already imported in the first cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X_df_clean, y_df_clean, test_size=0.2, random_state=42
)

In [None]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer

# Text-valued feature columns that need encoding
categorical_features = ['OverTime', 'JobRole']

# Pull OverTime out of each split as an (n_samples, 1) array, the 2-D layout
# the encoder expects.
overtime_train = X_train[['OverTime']].to_numpy()
overtime_test = X_test[['OverTime']].to_numpy()

# OverTime has two distinct values, so drop='first' leaves a single indicator
# column. The encoder is fit on the training split only.
ohe = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore').fit(overtime_train)

# Store the encoded indicator next to the original column in both splits
X_train['OverTimeEncoded'] = ohe.transform(overtime_train)
X_test['OverTimeEncoded'] = ohe.transform(overtime_test)

In [None]:
# Remove the raw text column from both splits now that OverTimeEncoded exists
X_train = X_train.drop(columns='OverTime')
X_test = X_test.drop(columns='OverTime')

# Show the dtypes that remain in the test features
print(X_test.dtypes)

Age                          int64
Education                    int64
HourlyRate                   int64
JobRole                     object
TotalWorkingYears            int64
DistanceFromHome             int64
EnvironmentSatisfaction      int64
YearsSinceLastPromotion      int64
YearsWithCurrManager         int64
OverTimeEncoded            float64
dtype: object


In [None]:

# One-hot encode JobRole. The encoder is fit on the training split only;
# handle_unknown='ignore' encodes a role that was not seen during fit as all zeros.
ohe2 = OneHotEncoder(sparse_output=False, drop=None, handle_unknown='ignore')
ohe2.fit(np.array(X_train['JobRole']).reshape(-1, 1))

# Transform the 'JobRole' column in both training and testing data
jobrole_encoded_train = ohe2.transform(np.array(X_train['JobRole']).reshape(-1, 1))
jobrole_encoded_test = ohe2.transform(np.array(X_test['JobRole']).reshape(-1, 1))

# Convert the encoded arrays to DataFrames that reuse each split's own index.
# train_test_split shuffles rows, so X_train / X_test carry non-contiguous index
# labels. join() aligns on index labels, so a default 0..n-1 RangeIndex here
# would attach encodings to the wrong rows and leave NaN wherever a label has
# no match.
jobrole_columns = ohe2.get_feature_names_out(['JobRole'])
jobrole_encoded_train_df = pd.DataFrame(
    jobrole_encoded_train, columns=jobrole_columns, index=X_train.index
)
jobrole_encoded_test_df = pd.DataFrame(
    jobrole_encoded_test, columns=jobrole_columns, index=X_test.index
)

# Add the encoded columns to the original DataFrames
X_train = X_train.join(jobrole_encoded_train_df)
X_test = X_test.join(jobrole_encoded_test_df)

# Drop the original 'JobRole' column
X_train = X_train.drop(columns='JobRole')
X_test = X_test.drop(columns='JobRole')

# Print the data types of the training features
print(X_train.dtypes)


Age                                    int64
Education                              int64
HourlyRate                             int64
TotalWorkingYears                      int64
DistanceFromHome                       int64
EnvironmentSatisfaction                int64
YearsSinceLastPromotion                int64
YearsWithCurrManager                   int64
OverTimeEncoded                      float64
JobRole_Healthcare Representative    float64
JobRole_Human Resources              float64
JobRole_Laboratory Technician        float64
JobRole_Manager                      float64
JobRole_Manufacturing Director       float64
JobRole_Research Director            float64
JobRole_Research Scientist           float64
JobRole_Sales Executive              float64
JobRole_Sales Representative         float64
dtype: object


In [None]:
# Create a StandardScaler and fit it on the training features only, so the
# scaling statistics never see the test split.
scaler = StandardScaler().fit(X_train)

# Scale the training and testing data with the statistics learned above
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# from re import X
# np.random.seed(42)
# X_train, X_test, y_train, y_test = train_test_split(transformed_X, y_df, test_size=0.2, random_state=42)

In [None]:
# Create a OneHotEncoder for the Department column
oheDepartment = ["Department"]

# drop=None keeps one indicator column per Department category.
ohed = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop=None)

# Use the same input type (an (n_samples, 1) array) for fit and transform so
# scikit-learn does not warn about feature names differing between the calls.
department_train = y_train[oheDepartment].to_numpy()
department_test = y_test[oheDepartment].to_numpy()

# Fit the encoder to the training data
ohed.fit(department_train)

# Create two new variables by applying the encoder
# to the training and testing data.
# They get Department-specific names because the Attrition cell assigns
# `y_train_encoded` / `y_test_encoded`; sharing those names would overwrite the
# Department targets before they can be used.
y_train_department_encoded = ohed.transform(department_train)
y_test_department_encoded = ohed.transform(department_test)









In [None]:
# Create a OneHotEncoder for the Attrition column
oheAttrition = ["Attrition"]

# `sparse_output` is the keyword every other encoder in this notebook uses; the
# older `sparse=` spelling is rejected by recent scikit-learn releases.
# No category is dropped: the Attrition output layer has 2 softmax units and is
# trained with categorical_crossentropy, so the target needs one column per class.
ohea = OneHotEncoder(sparse_output=False, drop=None)

# Fit the encoder to the training data only (not the full y_df), so nothing
# from the test split influences preprocessing.
ohea.fit(y_train[oheAttrition])

# Encoding of the complete label column, kept under its original name.
y_attrition_encoded = ohea.transform(y_df[oheAttrition])

# Create two new variables by applying the encoder
# to the training and testing data
y_train_encoded = ohea.transform(y_train[oheAttrition])
y_test_encoded = ohea.transform(y_test[oheAttrition])





## Create, Compile, and Train the Model

In [None]:
# Find the number of columns in the X training data
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense

num_columns = X_train.shape[1]


# Create the input layer.
# Size it from the data rather than a literal: the hard-coded 19 did not match
# the 18 feature columns reported by the model summary.
input_layer = Input(shape=(num_columns,))


# Create at least two shared layers.
# Two distinct Dense layers are stacked; calling one layer twice on the same
# input only yields the same activations twice, not a second layer.
shared_layer = Dense(64, activation='relu')
shared_layer_2 = Dense(64, activation='relu')
shared_output_1 = shared_layer(input_layer)
shared_output_2 = shared_layer_2(shared_output_1)

print(shared_output_1)
print(shared_output_2)

KerasTensor(type_spec=TensorSpec(shape=(None, 64), dtype=tf.float32, name=None), name='dense/Relu:0', description="created by layer 'dense'")
KerasTensor(type_spec=TensorSpec(shape=(None, 64), dtype=tf.float32, name=None), name='dense/Relu:0', description="created by layer 'dense'")


In [None]:
# Create a branch for Department
# with a hidden layer and an output layer
import tensorflow as tf

dep_main_input = tf.keras.layers.Input(shape=(num_columns,))

# Create the hidden layer
dep_hidden_layer = tf.keras.layers.Dense(64, activation='relu')(dep_main_input)

# Create the output layer.
# One softmax unit per Department category, read from the fitted `ohed` encoder.
# Department has 3 distinct values in this dataset (see the nunique output), so
# a 2-unit layer cannot represent every class.
num_departments = len(ohed.categories_[0])
dep_output_layer = tf.keras.layers.Dense(num_departments, activation='softmax')(dep_hidden_layer)

department_model = tf.keras.Model(inputs=dep_main_input, outputs=dep_output_layer)



In [None]:
# Create a branch for Attrition
# with a hidden layer and an output layer
attr_main_input = tf.keras.layers.Input(shape=(num_columns,))

# Create the hidden layer (the variable keeps its original `atrr_` spelling)
attr_hidden_dense = tf.keras.layers.Dense(units=64, activation='relu')
atrr_hidden_layer = attr_hidden_dense(attr_main_input)

# Create the output layer: two softmax units
attr_output_dense = tf.keras.layers.Dense(units=2, activation='softmax')
attr_output_layer = attr_output_dense(atrr_hidden_layer)

# Standalone model covering only this branch
attrition_model = tf.keras.Model(inputs=attr_main_input, outputs=attr_output_layer)



In [None]:
# Create the model
# Two inputs (one per branch) and two outputs, ordered Department then Attrition.
# NOTE(review): `input_layer` and the shared layers created earlier are not part
# of this graph; each branch starts from its own Input. Confirm this is intended.
branch_inputs = [dep_main_input, attr_main_input]
branch_outputs = [dep_output_layer, attr_output_layer]
model = tf.keras.Model(inputs=branch_inputs, outputs=branch_outputs)


# Compile the model
# The single loss and the single metric list are applied to both outputs.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Summarize the model
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 18)]                 0         []                            
                                                                                                  
 input_3 (InputLayer)        [(None, 18)]                 0         []                            
                                                                                                  
 dense_1 (Dense)             (None, 64)                   1216      ['input_2[0][0]']             
                                                                                                  
 dense_3 (Dense)             (None, 64)                   1216      ['input_3[0][0]']             
                                                                                            

In [None]:
# Train the model
# Targets must follow the model's output order: Department first, Attrition second.
# `y_train_encoded` / `y_test_encoded` hold the Attrition encoding by this point
# (the Attrition cell assigns them last), so the Department targets are derived
# here from the already-fitted `ohed` encoder instead of passing the same array
# to both outputs.
department_targets_train = ohed.transform(y_train[["Department"]].to_numpy())
department_targets_test = ohed.transform(y_test[["Department"]].to_numpy())

# The model has one input per branch, so the scaled features are passed twice.
model.fit(
    [X_train_scaled, X_train_scaled],
    [department_targets_train, y_train_encoded],
    epochs=10,
    batch_size=32,
    validation_data=(
        [X_test_scaled, X_test_scaled],
        [department_targets_test, y_test_encoded],
    ),
)

Epoch 1/10

ValueError: Unexpected result of `train_function` (Empty logs). This could be due to issues in input pipeline that resulted in an empty dataset. Otherwise, please use `Model.compile(..., run_eagerly=True)`, or `tf.config.run_functions_eagerly(True)` for more information of where went wrong, or file a issue/bug to `tf.keras`.

In [None]:
# Evaluate the model with the testing data


In [None]:
# Print the accuracy for both department and attrition


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. I was unable to figure out the code to train the model, but I am not sure that accuracy would be a good metric to use on this data. Looking closely at the data, it seems that many factors can lead to a negative outcome, and these will affect the model's accuracy.
2.
3. The model could be improved by further cleaning the data to clarify some of the metrics used to compile it. While columns like "distance from home" and "education" may seem like good measures of attrition, they do not take into account some of the nuances of life that affect happiness in a job.