# Using a Logistic Regression Model to predict whether an IBM employee will stay or leave their job

In [1]:
# Import the modules
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the IBM HR employee attrition CSV from the Resources folder into a DataFrame
file_path = "../Resources/IBM-HR-Employee-Attrition_data.csv"
employee_data_df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows
employee_data_df.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


## Preparing the Data for Modeling

In [3]:
# Encode the target column numerically: "Yes" (left) -> 1, "No" (stayed) -> 0.
# Integer codes are used instead of the strings "1"/"0" so the labels are a
# proper numeric target.
symbol_mapping = {"Yes": 1, "No": 0}

# Values that are not keys of the mapping pass through unchanged, so re-running
# this cell on the already-encoded column is a no-op.
employee_data_df["Attrition"] = employee_data_df["Attrition"].map(
    lambda label: symbol_mapping.get(label, label)
)

# Fail early if the column holds anything other than the two expected classes
assert employee_data_df["Attrition"].isin([0, 1]).all(), "unexpected Attrition value"

employee_data_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,0,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [4]:
# Split the DataFrame into the predictors and the target

# Features (X): every column except the "Attrition" target
X = employee_data_df.drop(columns="Attrition")

# Labels (y): the "Attrition" column on its own
y = employee_data_df.loc[:, "Attrition"]

In [5]:
# One-hot encode the categorical columns with `pd.get_dummies` so that every feature is numeric
X = pd.get_dummies(data=X)
X.head(n=5)

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_No,OverTime_Yes
0,41,1102,1,2,1,1,2,94,3,2,...,0,0,1,0,0,0,1,1,0,1
1,49,279,8,1,1,2,3,61,2,2,...,0,1,0,0,0,1,0,1,1,0
2,37,1373,2,2,1,4,4,92,2,1,...,0,0,0,0,0,0,1,1,0,1
3,33,1392,3,4,1,5,4,56,3,1,...,0,1,0,0,0,1,0,1,0,1
4,27,591,2,1,1,7,1,40,3,1,...,0,0,0,0,0,1,0,1,1,0


In [6]:
# Count how many rows fall into each class of the target labels
y.value_counts()

0    1233
1     237
Name: Attrition, dtype: int64

This is an imbalanced dataset: far more employees stayed (1,233) than left (237), which is what we would generally expect. We'll first fit the model on the original data and then try a random oversampler if the results are poor.

In [7]:
# Split the data into training and testing sets using train_test_split.
# (`train_test_split` is already imported in the first cell, so it is not re-imported here.)
# random_state=1 makes the split reproducible; stratify=y keeps the class
# proportions of y the same in the training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [8]:
# Build a StandardScaler and fit it on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both the training and the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [9]:
# Instantiate the Logistic Regression model
# (`LogisticRegression` is imported with the other modules in the first cell)
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using the scaled training data
classifier.fit(X_train_scaled, y_train)

LogisticRegression(random_state=1)

In [10]:
# Predict the labels of the scaled testing data
predictions = classifier.predict(X_test_scaled)

# Put predicted and actual labels side by side, indexed like y_test
results = pd.DataFrame(index=y_test.index).assign(Prediction=predictions, Actual=y_test)
results.head(n=5)

Unnamed: 0,Prediction,Actual
279,0,0
837,0,0
1165,1,0
57,1,0
1196,0,0


In [11]:
# Score the predictions: plain accuracy and balanced accuracy
balanced_accuracy = balanced_accuracy_score(y_true=y_test, y_pred=predictions)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

# Report both scores as percentages rounded to two decimals
print(f"The accuracy score for this model is {round(accuracy*100, 2)}%")
print(f"The balanced accuracy score for this model is {round(balanced_accuracy*100, 2)}%")

The accuracy score for this model is 87.23%
The balanced accuracy score for this model is 71.14%


In [12]:
# Build the confusion matrix from the actual and predicted test labels
cm = confusion_matrix(y_true=y_test, y_pred=predictions)

# Wrap it in a labelled DataFrame (rows: actual outcome, columns: predicted outcome)
cm_df = pd.DataFrame(
    data=cm,
    index=["Stayed", "Left"],
    columns=["Predicted Stayed", "Predicted Left"],
)

cm_df

Unnamed: 0,Predicted Stayed,Predicted Left
Stayed,293,16
Left,31,28


In [13]:
# Build the per-class precision / recall / f1 report for the model, then print it
classification_report_output = classification_report(y_true=y_test, y_pred=predictions)
print(classification_report_output)

              precision    recall  f1-score   support

           0       0.90      0.95      0.93       309
           1       0.64      0.47      0.54        59

    accuracy                           0.87       368
   macro avg       0.77      0.71      0.73       368
weighted avg       0.86      0.87      0.86       368



This model isn't very good: although the overall accuracy is 87%, the balanced accuracy is only 71%, and the precision (64%) and recall (47%) for predicting who left are poor.

## Using an oversampled dataset

In this section, we will oversample the training data to make it more balanced and see whether that improves the model's performance.

In [14]:
# Instantiate the random oversampler from imbalanced-learn
# (`RandomOverSampler` is imported with the other modules in the first cell)
# Assign a random_state parameter of 1 to the model
ros = RandomOverSampler(random_state=1)

# Resample the original (unscaled) training data with the random oversampler
X_over, y_over = ros.fit_resample(X_train, y_train)

In [15]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_over_scaler = scaler.fit(X_over)

# Scale the data
X_train_scaled = X_over_scaler.transform(X_over)
X_test = X_over_scaler.transform(X_test)

In [16]:
# Count the distinct values of the resampled labels data
y_over.value_counts()

1    924
0    924
Name: Attrition, dtype: int64

In [18]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier_ros = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using the resampled training data
classifier_ros.fit(X_train_scaled, y_over)

# Make a prediction using the testing data
predictions_ros = classifier_ros.predict(X_test)
results = pd.DataFrame({"Prediction": predictions_ros, "Actual": y_test})
results.head()

Unnamed: 0,Prediction,Actual
279,0,0
837,0,0
1165,1,0
57,1,0
1196,0,0


In [19]:
# Score the oversampled model's predictions: plain accuracy and balanced accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=predictions_ros)
balanced_accuracy = balanced_accuracy_score(y_true=y_test, y_pred=predictions_ros)

# Report both scores as percentages rounded to two decimals
print(f"The balanced accuracy score for this model is {round(balanced_accuracy*100, 2)}%")
print(f"The accuracy score for this model is {round(accuracy*100, 2)}%")

The balanced accuracy score for this model is 75.11%
The accuracy score for this model is 72.01%


In [22]:
# Build the confusion matrix for the oversampled model's test predictions
cm_ros = confusion_matrix(y_true=y_test, y_pred=predictions_ros)

# Wrap it in a labelled DataFrame (rows: actual outcome, columns: predicted outcome)
cm_ros_df = pd.DataFrame(
    data=cm_ros,
    index=["Stayed", "Left"],
    columns=["Predicted Stayed", "Predicted Left"],
)

cm_ros_df

Unnamed: 0,Predicted Stayed,Predicted Left
Stayed,218,91
Left,12,47


In [23]:
# Build the per-class precision / recall / f1 report for the oversampled model, then print it
classification_report_ros = classification_report(y_true=y_test, y_pred=predictions_ros)
print(classification_report_ros)

              precision    recall  f1-score   support

           0       0.95      0.71      0.81       309
           1       0.34      0.80      0.48        59

    accuracy                           0.72       368
   macro avg       0.64      0.75      0.64       368
weighted avg       0.85      0.72      0.76       368



This is still not very good. The balanced accuracy and accuracy here are only about 75% and 72%, respectively. The precision for the remainers is still high (95%) and the recall for the leavers is better (80%), but this improvement comes at the cost of a lower recall for remainers (71%) and a very low precision for leavers (34%).