# IBM HR Analytics Employee Attrition & Performance

In [1]:
# imports
import pandas as pd

### Load Data

In [2]:

# Load the dataset
file_path = "data/IBM HR Analytics Employee Attrition & Performance/WA_Fn-UseC_-HR-Employee-Attrition.csv"
data = pd.read_csv(file_path)
data

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


Target Variable: Atrrition

### Preprocessing

In [3]:
data.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [5]:
# Convert target column to binary
data['Attrition'] = LabelEncoder().fit_transform(data['Attrition'])  # 0 = No, 1 = Yes

# One-hot encode categorical features
data = pd.get_dummies(data, drop_first=True)
data.head()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,OverTime_Yes
0,41,1,1102,1,2,1,1,2,94,3,...,False,False,False,False,False,True,False,False,True,True
1,49,0,279,8,1,1,2,3,61,2,...,False,False,False,False,True,False,False,True,False,False
2,37,1,1373,2,2,1,4,4,92,2,...,True,False,False,False,False,False,False,False,True,True
3,33,0,1392,3,4,1,5,4,56,3,...,False,False,False,False,True,False,False,True,False,True
4,27,0,591,2,1,1,7,1,40,3,...,True,False,False,False,False,False,False,True,False,False


In [6]:
# Split into features (X) and target (y)
X = data.drop('Attrition', axis=1)
y = data['Attrition']

In [7]:

# Normalize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [8]:
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Training

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [10]:
# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

In [11]:
# Create DataLoader
train_data = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

In [19]:
import torch.nn as nn

class BasicNN(nn.Module):
    def __init__(self, input_size, output_size=1):
        super(BasicNN, self).__init__()

        self.model = nn.Sequential(
            nn.Linear(input_size, 32),
            nn.ReLU(),

            nn.Linear(32, 16),
            nn.ReLU(),

            nn.Linear(16, output_size),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        return self.model(x)

In [20]:
# Initialize model, loss, and optimizer
input_size = X_train.shape[1]
output_size = 1 # for binary classification

model = BasicNN(input_size, output_size)
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [21]:
# Training loop
epochs = 20
for epoch in range(epochs):
    for batch_X, batch_y in train_loader:
        batch_y = batch_y.view(-1, 1)  # Reshape target
        predictions = model(batch_X)
        loss = criterion(predictions, batch_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

Epoch 1/20, Loss: 0.5949
Epoch 2/20, Loss: 0.5099
Epoch 3/20, Loss: 0.3496
Epoch 4/20, Loss: 0.4052
Epoch 5/20, Loss: 0.2494
Epoch 6/20, Loss: 0.2318
Epoch 7/20, Loss: 0.2817
Epoch 8/20, Loss: 0.2996
Epoch 9/20, Loss: 0.1510
Epoch 10/20, Loss: 0.2228
Epoch 11/20, Loss: 0.1102
Epoch 12/20, Loss: 0.0746
Epoch 13/20, Loss: 0.1978
Epoch 14/20, Loss: 0.2716
Epoch 15/20, Loss: 0.2523
Epoch 16/20, Loss: 0.1370
Epoch 17/20, Loss: 0.3136
Epoch 18/20, Loss: 0.2495
Epoch 19/20, Loss: 0.1420
Epoch 20/20, Loss: 0.2404


### Evaluation

In [22]:
from sklearn.metrics import accuracy_score

In [23]:

# Switch the model to evaluation mode
model.eval()

BasicNN(
  (model): Sequential(
    (0): Linear(in_features=47, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=1, bias=True)
    (5): Sigmoid()
  )
)

In [24]:
# Disable gradient computation during testing
with torch.no_grad():
    # Predict on the test set
    test_predictions = model(X_test_tensor)
    test_predictions = (test_predictions > 0.5).float()  # Convert probabilities to binary (0 or 1)

    # Calculate accuracy
    test_accuracy = accuracy_score(y_test_tensor, test_predictions)
    print(f"Test Accuracy: {test_accuracy:.4f}")

    # Calculate loss
    test_loss = criterion(test_predictions, y_test_tensor.view(-1, 1))
    print(f"Test Loss: {test_loss.item():.4f}")

Test Accuracy: 0.8878
Test Loss: 11.2245
