# Feature Engineering for Employee Attrition

## Objective
Transform the cleaned HR attrition dataset into a model-ready feature set
by encoding domain-driven hypotheses derived from EDA.

This notebook:
- Produces a reusable processed dataset for downstream modeling

In [6]:
from pathlib import Path

import numpy as np
import pandas as pd

# Raise pandas' display limits so the wide HR frame is shown without
# column truncation or early line wrapping.
pd.options.display.max_columns = 200
pd.options.display.width = 200

## 1. Load Cleaned Dataset

This dataset is the output of the data audit stage and serves as the input
for feature engineering.

In [None]:
# Input for this notebook: the cleaned CSV written by the data audit stage.
data_path = "../data/interim/hr_employee_attrition_audit_clean.csv"

# Read the full file into memory; later cells refer to this frame as `df`.
df = pd.read_csv(filepath_or_buffer=data_path)

# Cell output: (n_rows, n_columns) of the loaded frame.
df.shape

(1470, 31)

In [8]:
# Show the first three rows as a quick sanity check of the loaded columns.
df.iloc[:3]

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Yes,11,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,No,23,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Yes,15,3,2,0,7,3,3,0,0,0,0


## 2. Target Definition

Attrition is converted into a binary target variable for modeling.

In [9]:
# Binary label: 1 where Attrition is exactly "Yes", 0 for every other value.
df["AttritionFlag"] = df["Attrition"].eq("Yes").astype(int)

# Cell output: per-class counts and the overall attrition rate (mean of 0/1).
attrition_flag = df["AttritionFlag"]
attrition_flag.value_counts(), attrition_flag.mean()

(AttritionFlag
 0    1233
 1     237
 Name: count, dtype: int64,
 np.float64(0.16122448979591836))

## 3. Feature Engineering

Features are engineered based on EDA findings, including:
- Income, age, experience, tenure, and distance buckets
- Workload and travel indicators
- Interaction features capturing combined risk
- Composite engagement index

In [None]:
class FeatureEngineer:
    """Add attrition features to the HR frame through a fit/transform API.

    ``fit`` learns quartile cutpoints for five numeric columns.  ``transform``
    returns a copy of its input with 18 extra columns: quartile buckets,
    binary flags, overtime interaction terms, an ordinal travel score and a
    mean engagement index.

    Attributes
    ----------
    cutpoints_ : dict
        Maps a bucket name (``"income"``, ``"age"``, ``"experience"``,
        ``"tenure"``, ``"distance"``) to the array of five quantile values
        (min, 25%, 50%, 75%, max) learned by ``fit``.
    """

    # bucket name (key in ``cutpoints_``) -> (source column, output column).
    # Insertion order fixes the order of the output columns.
    _QUARTILE_SPECS = {
        "income": ("MonthlyIncome", "income_quartile"),
        "age": ("Age", "age_quartile"),
        "experience": ("TotalWorkingYears", "exp_quartile"),
        "tenure": ("YearsAtCompany", "tenure_quartile"),
        "distance": ("DistanceFromHome", "distance_quartile"),
    }

    # Quantile levels that delimit the four buckets.
    _QUANTILE_LEVELS = (0, 0.25, 0.5, 0.75, 1.0)

    # Ordinal encoding of BusinessTravel used for ``travel_intensity``.
    _TRAVEL_MAP = {
        "Non-Travel": 0,
        "Travel_Rarely": 1,
        "Travel_Frequently": 2,
    }

    # Survey-style columns averaged into ``engagement_index``.
    _ENGAGEMENT_COLS = [
        "JobSatisfaction",
        "EnvironmentSatisfaction",
        "RelationshipSatisfaction",
        "WorkLifeBalance",
        "JobInvolvement",
    ]

    def __init__(self):
        self.cutpoints_ = {}

    def fit(self, X):
        """Learn the quartile cutpoints of each bucketed column from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the source columns listed in ``_QUARTILE_SPECS``.

        Returns
        -------
        FeatureEngineer
            ``self``, so the call can be chained.
        """
        for bucket_name, (source_col, _) in self._QUARTILE_SPECS.items():
            self.cutpoints_[bucket_name] = np.quantile(
                X[source_col], self._QUANTILE_LEVELS
            )
        return self

    @staticmethod
    def _bucketize(values, cutpoints):
        """Return the 0-3 bucket index of each value given fitted cutpoints.

        The two outer edges are widened to -inf/+inf so that values outside
        the range seen by ``fit`` land in the first/last bucket instead of
        becoming NaN.  Values inside the fitted range are bucketed exactly as
        they would be with the raw min/max edges.
        NOTE: heavily tied columns can yield repeated interior cutpoints,
        which ``pd.cut`` rejects by default.
        """
        # Work on a float copy so the stored ``cutpoints_`` stay untouched.
        edges = np.array(cutpoints, dtype=float)
        edges[0], edges[-1] = -np.inf, np.inf
        return pd.cut(values, bins=edges, labels=False, include_lowest=True)

    def transform(self, X):
        """Return a copy of ``X`` with the engineered feature columns appended.

        ``fit`` must have been called first; otherwise looking up the
        cutpoints raises ``KeyError``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with the same raw columns that ``fit`` saw, plus
            ``OverTime``, ``BusinessTravel``, ``MaritalStatus`` and the
            engagement columns.  It is not modified.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with 18 additional columns.

        Raises
        ------
        ValueError
            If ``BusinessTravel`` holds a value missing from the travel map.
        """
        X = X.copy()

        # Quartile buckets (0 = lowest quarter, 3 = highest quarter).
        for bucket_name, (source_col, bucket_col) in self._QUARTILE_SPECS.items():
            X[bucket_col] = self._bucketize(
                X[source_col], self.cutpoints_[bucket_name]
            )

        # Binary flags taken straight from categorical columns.
        X["overtime_flag"] = (X["OverTime"] == "Yes").astype(int)
        X["frequent_travel_flag"] = (
            X["BusinessTravel"] == "Travel_Frequently"
        ).astype(int)
        X["is_single_flag"] = (X["MaritalStatus"] == "Single").astype(int)

        # Binary flags derived from the buckets and from raw tenure.
        X["low_income_flag"] = (X["income_quartile"] == 0).astype(int)
        X["early_tenure_flag"] = (X["tenure_quartile"] == 0).astype(int)
        X["new_hire_flag"] = (X["YearsAtCompany"] <= 1).astype(int)
        X["long_distance_flag"] = (X["distance_quartile"] == 3).astype(int)

        # Interactions: each risk flag combined with overtime (product of 0/1).
        for risk in ("low_income", "early_tenure", "new_hire", "frequent_travel"):
            X[f"{risk}_and_overtime"] = X[f"{risk}_flag"] * X["overtime_flag"]

        # Travel intensity (ordinal).  Fail with a readable message instead of
        # an opaque NaN-to-int cast error when a category is not in the map.
        travel_codes = X["BusinessTravel"].map(self._TRAVEL_MAP)
        unmapped = travel_codes.isna()
        if unmapped.any():
            unknown = sorted(X.loc[unmapped, "BusinessTravel"].astype(str).unique())
            raise ValueError(f"Unrecognised BusinessTravel values: {unknown}")
        X["travel_intensity"] = travel_codes.astype(int)

        # Engagement index: row-wise mean of the satisfaction-style columns.
        X["engagement_index"] = X[self._ENGAGEMENT_COLS].mean(axis=1)

        return X

In [12]:
# Feature matrix: every column except the raw label and its binary encoding.
X = df.drop(columns=["Attrition", "AttritionFlag"])

# fit() returns the fitted instance, so construction and fitting are chained.
fe = FeatureEngineer().fit(X)
X_fe = fe.transform(X)

# Cell output: (n_rows, n_columns) after the engineered columns are appended.
X_fe.shape

(1470, 48)

In [13]:
# Show the first three engineered rows to eyeball the new columns.
X_fe.iloc[:3]

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,income_quartile,age_quartile,exp_quartile,tenure_quartile,distance_quartile,overtime_flag,frequent_travel_flag,is_single_flag,low_income_flag,early_tenure_flag,new_hire_flag,long_distance_flag,low_income_and_overtime,early_tenure_and_overtime,new_hire_and_overtime,frequent_travel_and_overtime,travel_intensity,engagement_index
0,41,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Yes,11,3,1,0,8,0,1,6,4,0,5,2,2,1,2,0,1,0,1,0,0,0,0,0,0,0,0,1,2.2
1,49,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,No,23,4,4,1,10,3,3,10,7,1,7,2,3,1,3,2,0,1,0,0,0,0,0,0,0,0,0,2,2.8
2,37,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Yes,15,3,2,0,7,3,3,0,0,0,0,0,2,1,0,0,1,0,1,1,1,1,0,1,1,1,0,1,2.8


In [14]:
# Completeness check: total number of missing cells across the whole frame.
missing_total = X_fe.isna().sum().sum()
# Consistency check: how many columns there are of each dtype.
dtype_counts = X_fe.dtypes.value_counts()

missing_total, dtype_counts

(np.int64(0),
 int64      40
 object      7
 float64     1
 Name: count, dtype: int64)

In [15]:
# Column names of the engineered frame, in their on-disk order.
final_features = list(X_fe.columns)

# Cell output: the feature count followed by the names themselves.
len(final_features), final_features

(48,
 ['Age',
  'BusinessTravel',
  'DailyRate',
  'Department',
  'DistanceFromHome',
  'Education',
  'EducationField',
  'EnvironmentSatisfaction',
  'Gender',
  'HourlyRate',
  'JobInvolvement',
  'JobLevel',
  'JobRole',
  'JobSatisfaction',
  'MaritalStatus',
  'MonthlyIncome',
  'MonthlyRate',
  'NumCompaniesWorked',
  'OverTime',
  'PercentSalaryHike',
  'PerformanceRating',
  'RelationshipSatisfaction',
  'StockOptionLevel',
  'TotalWorkingYears',
  'TrainingTimesLastYear',
  'WorkLifeBalance',
  'YearsAtCompany',
  'YearsInCurrentRole',
  'YearsSinceLastPromotion',
  'YearsWithCurrManager',
  'income_quartile',
  'age_quartile',
  'exp_quartile',
  'tenure_quartile',
  'distance_quartile',
  'overtime_flag',
  'frequent_travel_flag',
  'is_single_flag',
  'low_income_flag',
  'early_tenure_flag',
  'new_hire_flag',
  'long_distance_flag',
  'low_income_and_overtime',
  'early_tenure_and_overtime',
  'new_hire_and_overtime',
  'frequent_travel_and_overtime',
  'travel_intensity',
  'engagement_index'])

In [17]:
# Re-attach the binary target to the engineered features.  `.values` assigns
# by position, which lines up because X_fe was built from df without
# reordering or dropping rows.
df_processed = X_fe.copy()
df_processed["AttritionFlag"] = df["AttritionFlag"].values

processed_path = "../data/processed/hr_employee_attrition_features.csv"
# Create the output folder if it is missing so the notebook also runs end to
# end on a fresh checkout where data/processed/ does not exist yet.
Path(processed_path).parent.mkdir(parents=True, exist_ok=True)
df_processed.to_csv(processed_path, index=False)

# Cell output: shape of the saved frame and where it was written.
df_processed.shape, processed_path

((1470, 49), '../data/processed/hr_employee_attrition_features.csv')

## Feature Engineering Summary

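This notebook produced a complete, model-ready dataset that is free of target
leakage (no engineered feature uses `Attrition` or `AttritionFlag`). Note that
the quartile cutpoints were fit on the full dataset here; for a strictly
leakage-safe evaluation, refit `FeatureEngineer` on the training split only and
apply `transform` to the validation/test splits. The dataset was built by: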
- Encoding domain-informed features derived from EDA
- Capturing non-linear and interaction effects
- Validating feature completeness and consistency

The resulting dataset is saved under `data/processed/` and will be used
as the single source of truth for all modeling experiments.