In [2]:
"""
Employee Attrition Analysis Pipeline

This script implements a complete machine learning pipeline to predict employee attrition
using Logistic Regression, Random Forest, and XGBoost classifiers on the IBM Employee Attrition dataset.
It covers data loading, preprocessing (including standard scaling and one-hot encoding), model training,
evaluation, and feature importance extraction for tree-based classifiers.

PEP8 standards have been applied for code readability and consistency.
"""

# Importing Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report


In [5]:

# Load Dataset
data = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.xls')
data


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [6]:

# Define Target Variable
y = data['Attrition'].map({'Yes': 1, 'No': 0})
X = data.drop(columns=['Attrition', 'EmployeeNumber', 'Over18', 'StandardHours'])

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Data Splitting
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Preprocessing Setup
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)


In [8]:

# Define Models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, random_state=42,
                             use_label_encoder=False, eval_metric='logloss')
}

# Training and Evaluation
for name, model in models.items():
    clf_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])
    clf_pipeline.fit(X_train, y_train)
    y_pred = clf_pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n{name} Model Accuracy: {accuracy:.2f}")
    print(classification_report(y_test, y_pred))

    if name in ['Random Forest', 'XGBoost']:
        # Feature Importance (for tree-based models)
        feature_names = numerical_cols.tolist() + list(
            clf_pipeline.named_steps['preprocessor']
            .named_transformers_['cat']
            .get_feature_names_out(categorical_cols)
        )
        importances = model.feature_importances_
        feature_importance_df = pd.DataFrame({
            'Feature': feature_names,
            'Importance': importances
        })
        top_features = feature_importance_df.sort_values(
            by='Importance', ascending=False
        ).head(10)
        print(f"Top 10 Features for {name} Model:\n", top_features, "\n")



Logistic Regression Model Accuracy: 0.88
              precision    recall  f1-score   support

           0       0.90      0.97      0.93       370
           1       0.71      0.42      0.53        71

    accuracy                           0.88       441
   macro avg       0.81      0.70      0.73       441
weighted avg       0.87      0.88      0.87       441


Random Forest Model Accuracy: 0.84
              precision    recall  f1-score   support

           0       0.86      0.97      0.91       370
           1       0.52      0.17      0.26        71

    accuracy                           0.84       441
   macro avg       0.69      0.57      0.58       441
weighted avg       0.80      0.84      0.81       441

Top 10 Features for Random Forest Model:
                  Feature  Importance
10         MonthlyIncome    0.070930
0                    Age    0.067780
17     TotalWorkingYears    0.060291
1              DailyRate    0.047996
11           MonthlyRate    0.045685
6   

Parameters: { "use_label_encoder" } are not used.



AttributeError: 'super' object has no attribute '__sklearn_tags__'