## Part 1: Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [3]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [5]:
# Create y_df with the Attrition and Department columns

# 
y_df = attrition_df[['Attrition', 'Department']]
# 
print(y_df)


     Attrition              Department
0          Yes                   Sales
1           No  Research & Development
2          Yes  Research & Development
3           No  Research & Development
4           No  Research & Development
...        ...                     ...
1465        No  Research & Development
1466        No  Research & Development
1467        No  Research & Development
1468        No                   Sales
1469        No  Research & Development

[1470 rows x 2 columns]


In [7]:
# Create a list of at least 10 column names to use as X data
selected_columns = [
    'Age',
    'BusinessTravel',
    'DistanceFromHome',
    'Education',
    'EnvironmentSatisfaction',
    'HourlyRate',
    'JobLevel',
    'JobSatisfaction',
    'MaritalStatus',
    'YearsAtCompany',
    'Department'
]

# Create X_df using your selected columns
X_df = attrition_df[selected_columns]

# Show the data types for X_df
print("X_df:")
print(X_df)
print("\nData types:")
print(X_df.dtypes)


X_df:
      Age     BusinessTravel  DistanceFromHome  Education  \
0      41      Travel_Rarely                 1          2   
1      49  Travel_Frequently                 8          1   
2      37      Travel_Rarely                 2          2   
3      33  Travel_Frequently                 3          4   
4      27      Travel_Rarely                 2          1   
...   ...                ...               ...        ...   
1465   36  Travel_Frequently                23          2   
1466   39      Travel_Rarely                 6          1   
1467   27      Travel_Rarely                 4          3   
1468   49  Travel_Frequently                 2          3   
1469   34      Travel_Rarely                 8          3   

      EnvironmentSatisfaction  HourlyRate  JobLevel  JobSatisfaction  \
0                           2          94         2                4   
1                           3          61         2                2   
2                           4          92    

In [9]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

# Split the data into features (X) and target (y)
X = X_df  # Features DataFrame
y = attrition_df['Attrition']  # Target column

# Split the data into training and testing sets
# 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the results
print("X_train:")
print(X_train)
print("\nX_test:")
print(X_test)
print("\ny_train:")
print(y_train)
print("\ny_test:")
print(y_test)


X_train:
      Age     BusinessTravel  DistanceFromHome  Education  \
1097   24      Travel_Rarely                21          2   
727    18         Non-Travel                 5          2   
254    29      Travel_Rarely                20          2   
1175   39      Travel_Rarely                12          3   
1341   31      Travel_Rarely                20          3   
...   ...                ...               ...        ...   
1130   35      Travel_Rarely                28          3   
1294   41      Travel_Rarely                 5          3   
860    22  Travel_Frequently                 3          4   
1459   29      Travel_Rarely                13          2   
1126   50      Travel_Rarely                 9          3   

      EnvironmentSatisfaction  HourlyRate  JobLevel  JobSatisfaction  \
1097                        3          57         1                1   
727                         2          73         1                4   
254                         4          45 

In [11]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Check data types of X_df
print("Initial data types:")
print(X_train.dtypes)

# Example: Identify non-numeric columns
categorical_columns = X_train.select_dtypes(include=['object']).columns

# Apply one-hot encoding to categorical columns (if any)
if not categorical_columns.empty:
    encoder = OneHotEncoder(sparse=False, drop='first')  # Drop first to avoid multicollinearity
    encoded_df = pd.DataFrame(
        encoder.fit_transform(X_train[categorical_columns]),
        columns=encoder.get_feature_names_out(categorical_columns),
        index=X_train.index
    )
    # Drop original categorical columns and merge the encoded columns
    X_train = X_train.drop(categorical_columns, axis=1).join(encoded_df)

# Ensure all columns are numeric
X_train = X_train.apply(pd.to_numeric, errors='coerce')

# Display the final X_df
print("\nConverted X_train:")
print(X_train)
# Confirm all columns are numeric
print("\nFinal data types:")
print(X_train.dtypes)

####################################
# Check data types of X_df
print("Initial data types:")
print(X_test.dtypes)

# Example: Identify non-numeric columns
categorical_columns = X_test.select_dtypes(include=['object']).columns

# Apply one-hot encoding to categorical columns (if any)
if not categorical_columns.empty:
    encoder = OneHotEncoder(sparse=False, drop='first')  # Drop first to avoid multicollinearity
    encoded_df = pd.DataFrame(
        encoder.fit_transform(X_test[categorical_columns]),
        columns=encoder.get_feature_names_out(categorical_columns),
        index=X_test.index
    )
    # Drop original categorical columns and merge the encoded columns
    X_test = X_test.drop(categorical_columns, axis=1).join(encoded_df)

# Ensure all columns are numeric
X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Display the final X_df
print("\nConverted X_train:")
print(X_test)
# Confirm all columns are numeric
print("\nFinal data types:")
print(X_test.dtypes)


Initial data types:
Age                         int64
BusinessTravel             object
DistanceFromHome            int64
Education                   int64
EnvironmentSatisfaction     int64
HourlyRate                  int64
JobLevel                    int64
JobSatisfaction             int64
MaritalStatus              object
YearsAtCompany              int64
Department                 object
dtype: object

Converted X_train:
      Age  DistanceFromHome  Education  EnvironmentSatisfaction  HourlyRate  \
1097   24                21          2                        3          57   
727    18                 5          2                        2          73   
254    29                20          2                        4          45   
1175   39                12          3                        4          66   
1341   31                20          3                        2          89   
...   ...               ...        ...                      ...         ...   
1130   35          



In [13]:
# Create a StandardScaler
from sklearn.preprocessing import StandardScaler
# Create a StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler to the training data
scaler.fit(X_train)

# Scale the training and testing data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Display scaled data
print("Scaled X_train:")
print(X_train_scaled)
print("\nScaled X_test:")
print(X_test_scaled)


Scaled X_train:
[[-1.38855944  1.44039645 -0.86335572 ... -0.66288195  0.73297674
  -0.66554097]
 [-2.04073779 -0.52269928 -0.86335572 ...  1.50856422  0.73297674
  -0.66554097]
 [-0.84507748  1.31770296 -0.86335572 ... -0.66288195 -1.36429977
   1.5025371 ]
 ...
 [-1.60595222 -0.76808624  1.06322176 ... -0.66288195  0.73297674
  -0.66554097]
 [-0.84507748  0.45884859 -0.86335572 ... -0.66288195  0.73297674
  -0.66554097]
 [ 1.43754676 -0.03192534  0.09993302 ... -0.66288195 -1.36429977
   1.5025371 ]]

Scaled X_test:
[[-0.95377387 -0.52269928  0.09993302 ...  1.50856422 -1.36429977
   1.5025371 ]
 [ 1.76363593  0.45884859 -0.86335572 ... -0.66288195  0.73297674
  -0.66554097]
 [-1.38855944  1.56308993 -1.82664446 ... -0.66288195 -1.36429977
  -0.66554097]
 ...
 [ 1.43754676  2.29925083  0.09993302 ... -0.66288195  0.73297674
  -0.66554097]
 [ 0.02449366  0.09076814 -0.86335572 ...  1.50856422  0.73297674
  -0.66554097]
 [ 1.11145758 -0.89077972 -0.86335572 ... -0.66288195 -1.36429977


In [15]:
# Create a OneHotEncoder for the Attrition column
# Fit the encoder to the training data
# Create two new variables by applying the encoder
# to the training and testing data
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Create a OneHotEncoder
encoder = OneHotEncoder(sparse=False, drop='first')  # drop='first' to avoid the dummy variable trap

# Fit the encoder to the training data (Attrition column)
encoder.fit(y_train.values.reshape(-1, 1))

# Apply the encoder to the training and testing data
y_train_encoded = encoder.transform(y_train.values.reshape(-1, 1))
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1))

# Convert the encoded data to DataFrame for clarity
encoded_columns = encoder.get_feature_names_out(['Attrition'])
y_train_encoded_df = pd.DataFrame(y_train_encoded, columns=encoded_columns, index=y_train.index)
y_test_encoded_df = pd.DataFrame(y_test_encoded, columns=encoded_columns, index=y_test.index)

# Display the encoded training and testing data
print("Encoded Training Data:")
print(y_train_encoded_df)

print("\nEncoded Testing Data:")
print(y_test_encoded_df)


Encoded Training Data:
      Attrition_Yes
1097            0.0
727             0.0
254             0.0
1175            0.0
1341            0.0
...             ...
1130            0.0
1294            0.0
860             1.0
1459            0.0
1126            0.0

[1176 rows x 1 columns]

Encoded Testing Data:
      Attrition_Yes
1041            0.0
184             0.0
1222            1.0
67              0.0
220             0.0
...             ...
567             0.0
560             0.0
945             0.0
522             0.0
651             0.0

[294 rows x 1 columns]




In [17]:
# Create a OneHotEncoder for the Department column
# Fit the encoder to the training data
# Create two new variables by applying the encoder
# to the training and testing data

# Already encoded

KeyError: "None of [Index(['Department'], dtype='object')] are in the [columns]"

## Create, Compile, and Train the Model

In [21]:
# Find the number of columns in the X training data
# Create the input layer
# Create at least two shared layers

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input

# Find the number of columns in the X training data
num_features = X_train_scaled.shape[1]  # Number of columns/features
print(f"Number of features in X training data: {num_features}")

# Create the input layer
input_layer = Input(shape=(num_features,))

# Create at least two shared layers
shared_layer_1 = Dense(units=64, activation='relu')(input_layer)  # 64 neurons, ReLU activation
shared_layer_2 = Dense(units=32, activation='relu')(shared_layer_1)  # 32 neurons, ReLU activation

# Example: Adding an output layer (binary classification)
output_layer = Dense(units=1, activation='sigmoid')(shared_layer_2)  # Sigmoid for binary classification

# Create the model
model = tf.keras.Model(inputs=input_layer, outputs=output_layer)

# Display the model summary
model.summary()


Number of features in X training data: 14


In [27]:
# Create a branch for Department
# with a hidden layer and an output layer
# Create the hidden layer
# Create the output layer

from tensorflow.keras.layers import Dense, Input, Concatenate
from tensorflow.keras.models import Model

# Input for Department branch
department_input = Input(shape=(X_train.shape[1],), name="Department_Input")

# Create the hidden layer for Department branch
department_hidden_layer = Dense(units=16, activation="relu", name="Department_Hidden")(department_input)

# Create the output layer for Department branch
department_output_layer = Dense(units=8, activation="relu", name="Department_Output")(department_hidden_layer)

# Create the full Department branch model
department_branch_model = Model(inputs=department_input, outputs=department_output_layer, name="Department_Branch")

# Summary of the Department branch model
department_branch_model.summary()




In [31]:
# Create a branch for Attrition
# with a hidden layer and an output layer
# Create the hidden layer
# Create the output layer

from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Input for Attrition branch
attrition_input = Input(shape=(y_train.shape[0],), name="Attrition_Input")

# Create the hidden layer for Attrition branch
attrition_hidden_layer = Dense(units=16, activation="relu", name="Attrition_Hidden")(attrition_input)

# Create the output layer for Attrition branch
attrition_output_layer = Dense(units=8, activation="relu", name="Attrition_Output")(attrition_hidden_layer)

# Create the full Attrition branch model
attrition_branch_model = Model(inputs=attrition_input, outputs=attrition_output_layer, name="Attrition_Branch")

# Summary of the Attrition branch model
attrition_branch_model.summary()


In [33]:
# Create the model
# Compile the model
# Summarize the model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Concatenate, Dense

# Combine the Department and Attrition branches with the shared layers
combined = Concatenate(name="Combined_Layer")([shared_layer_2, department_output_layer, attrition_output_layer])

# Add the final output layer for classification
final_output = Dense(units=1, activation="sigmoid", name="Final_Output")(combined)

# Create the full model
full_model = Model(inputs=[input_layer, department_input, attrition_input], outputs=final_output, name="Full_Model")

# Compile the model
full_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Summarize the model
full_model.summary()


In [43]:
# Train the model
# Prepare the inputs for the model
# X_train_scaled: Shared feature inputs
# X_train_encoded_df: Encoded Department inputs
# y_train_encoded_df: Encoded Attrition inputs

# Combine the inputs into a list for multi-input model
inputs_train = [X_train_scaled, X_train_encoded, y_train_encoded]

# Train the model
history = full_model.fit(
    X_train,  # Inputs to the model
    y_train,       # Target variable
    validation_split=0.2,  # Use 20% of training data for validation
    epochs=20,             # Number of epochs
    batch_size=32,         # Batch size
    verbose=1              # Display training progress
)

# Display training history
print("\nTraining complete. Model has been trained.")



NameError: name 'X_train_encoded' is not defined

In [None]:
# Evaluate the model with the testing data


In [None]:
# Print the accuracy for both department and attrition


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. 
2. 
3. 